# Extract Novi Sad University Faculties (name + website)

This notebook scrapes https://www.uns.ac.rs/index.php/en/faculties and extracts for each faculty:

- faculty_name
- website (as shown near the faculty name)

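The results are saved as `university_novisad_faculties.csv` in the output directory set in the "Save to CSV" cell (currently a hard-coded absolute path on the author's machine, not necessarily this folder).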

In [2]:
# Imports and constants
import re, time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse

UNS_URL = "https://www.uns.ac.rs/index.php/en/faculties"



In [3]:
# Fetch page (with retry)
def fetch_html(url: str, timeout: int = 30, retries: int = 3, backoff: float = 1.5) -> str:
    """Download ``url`` and return the response body as text, retrying on failure.

    Args:
        url: Address to fetch.
        timeout: Per-request timeout in seconds, handed to ``requests.get``.
        retries: Maximum number of attempts.
        backoff: Base of the exponential wait; after failed attempt ``i``
            (0-based) the function sleeps ``backoff ** (i + 1)`` seconds
            before trying again.

    Returns:
        The response text of the first successful attempt.

    Raises:
        RuntimeError: If no attempt succeeds (including ``retries <= 0``).
            The last underlying error, if any, is attached as ``__cause__``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
        'Accept-Language': 'en,en-US;q=0.8',
    }
    err = None
    for i in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP 4xx/5xx as failures so they are retried as well.
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            err = e
            # Only wait when another attempt follows; sleeping after the final
            # failure would just delay the error report.
            if i < retries - 1:
                time.sleep(backoff ** (i + 1))
    # Chain the last error so the traceback shows the real cause.
    raise RuntimeError(f"Failed to fetch {url}: {err}") from err

# Download the faculties index page once; the parsing cell below reads this string.
html = fetch_html(UNS_URL)

In [19]:
# Parse faculties by scanning anchors with uns.ac.rs and backtracking to name
# Loose URL matcher (case-insensitive): optional http(s) scheme, optional "www.",
# a dotted host ending in a 2+ letter suffix, and an optional path/query part.
# It is used via .search(), so it also fires on URLs embedded in longer text.
URL_RE = re.compile(r"((?:https?://)?(?:www\.)?[\w.-]+\.[a-z]{2,}(?:/[\w\-./?%&=]*)?)", re.I)

def extract_domain(u: str) -> str:
    """Return the lower-cased network location (netloc) of a URL-like string.

    Empty input and ``mailto:`` / ``tel:`` links give ``''``. Input without an
    http(s) scheme has its leading slashes removed and ``https://`` prepended
    before parsing. Any parsing error also gives ``''``.
    """
    if not u or u.startswith(('mailto:', 'tel:')):
        return ''
    has_scheme = u.lower().startswith(('http://', 'https://'))
    candidate = u if has_scheme else 'https://' + u.lstrip('/')
    try:
        parsed = urlparse(candidate)
    except Exception:
        return ''
    return parsed.netloc.lower()

def is_faculty_name(text: str) -> bool:
    """Tell whether ``text`` contains one of the faculty/academy marker phrases.

    The check is case-insensitive; ``None`` and empty strings give ``False``.
    """
    lowered = (text or '').lower()
    markers = ('faculty of', 'academy of', 'technical faculty', 'teacher training faculty')
    return any(marker in lowered for marker in markers)

def looks_like_site(s: str) -> bool:
    """Return True when ``URL_RE`` finds a URL-like token anywhere in ``s``.

    ``None`` is treated as an empty string.
    """
    return URL_RE.search(s or '') is not None

def parse_faculties(html: str):
    """Collect faculty name / website pairs from the faculties page markup.

    Every anchor whose href or visible text mentions ``uns.ac.rs`` and whose
    domain is not the main university host is a candidate. The faculty name
    is then looked up in the anchor's nearest enclosing block element.

    Returns:
        A list of ``{'faculty_name': ..., 'website': ...}`` dicts, unique per
        (name, website) pair, in document order.
    """

    def _name_from(block) -> str:
        # Look for a name inside `block`, trying three sources in order; each
        # later source is consulted only when the earlier ones gave nothing.
        if not block:
            return ''
        # 1) text of the first <strong>/<b> element
        bold = block.find(['strong', 'b'])
        if bold:
            bold_text = bold.get_text(' ', strip=True)
            if bold_text:
                return bold_text
        # 2) first anchor whose text looks like a faculty name
        for link in block.find_all('a'):
            label = link.get_text(' ', strip=True)
            if is_faculty_name(label):
                return label
        # 3) first "Faculty..." run in the block text
        hit = re.search(r'(Faculty[^|\n]+)', block.get_text(' ', strip=True), flags=re.I)
        return hit.group(1).strip() if hit else ''

    soup = BeautifulSoup(html, 'html.parser')
    results, seen = [], set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        txt = a.get_text(' ', strip=True)
        if href == '#' or href.startswith('mailto:'):
            continue
        href_mentions_uns = 'uns.ac.rs' in href
        if not href_mentions_uns and 'uns.ac.rs' not in txt:
            continue
        # Keep faculty subdomains only; the main university host is skipped.
        dom = extract_domain(href if href_mentions_uns else txt)
        if not dom or dom in {'uns.ac.rs', 'www.uns.ac.rs'}:
            continue
        # Nearest block-level ancestor, or the direct parent as a fallback.
        block = a.find_parent(['li', 'p', 'div', 'section', 'article']) or a.parent
        name = _name_from(block)
        if not name or not is_faculty_name(name):
            continue
        # Prefer the URL as displayed in the link text over the raw href.
        site = txt if looks_like_site(txt) else href
        key = (name, site)
        if key not in seen:
            seen.add(key)
            results.append({'faculty_name': name, 'website': site})
    return results

# Parse the page fetched earlier; `faculties` is a list of dicts with
# 'faculty_name' and 'website' keys that the cleaning cell consumes.
faculties = parse_faculties(html)

In [20]:
# Clean and tabulate (finalize)
def domain_of(url_like: str) -> str:
    """Return the lower-cased host part of ``url_like`` without a leading ``www.``.

    Args:
        url_like: A URL, a bare host (optionally with a path), or a
            protocol-relative reference such as ``//host/path``. Surrounding
            whitespace is ignored.

    Returns:
        The network location in lower case with any ``www.`` prefix removed,
        or ``''`` for empty/``None`` input, ``mailto:`` / ``tel:`` links
        (matched case-insensitively) and strings ``urlparse`` rejects.
    """
    u = (url_like or '').strip()
    if not u or u.lower().startswith(('mailto:', 'tel:')):
        return ''
    if not u.lower().startswith(('http://', 'https://')):
        # Drop leading slashes first: prefixing "//host" directly would produce
        # "https:////host", which parses to an empty network location.
        u = 'https://' + u.lstrip('/')
    try:
        netloc = urlparse(u).netloc.lower()
    except ValueError:
        # urlparse signals malformed input (e.g. a broken bracketed host) this way.
        return ''
    return netloc[4:] if netloc.startswith('www.') else netloc

def normalize_site(u: str) -> str:
    """Build a comparison key for a site: lower-cased host plus path.

    A missing http(s) scheme is filled in with ``https://`` (after removing
    leading slashes), a leading ``www.`` is dropped from the host and trailing
    slashes are dropped from the path. Query string and fragment are ignored.
    """
    has_scheme = u.lower().startswith(('http://', 'https://'))
    parsed = urlparse(u if has_scheme else 'https://' + u.lstrip('/'))
    host = parsed.netloc.lower()
    host = host[4:] if host.startswith('www.') else host
    return host + parsed.path.rstrip('/')

# Captures the leading "Faculty ..." / "Academy ..." phrase and stops right
# before the first street-address token (or a digit), so that an address
# appended to the name is dropped. The word-like tokens carry a trailing \b:
# without it the case-insensitive "Dr" / "Trg" / "bb" alternatives also fire on
# ordinary words that merely start with those letters (e.g. "Dramatic"),
# cutting the name short. "ul." and "Bul." already end in a literal dot.
NAME_TRUNC_RE = re.compile(
    r'^((?:Faculty|Academy)[^\|,\d]+?)'
    r'(?=\s(?:(?:Trg|Bulevar|Boulevard|Street|Ulica|bb|Dr)\b|ul\.|Bul\.|\d)|$)',
    re.I,
)

# Tidy the scraped rows: drop non-web links, trim address text off the names
# and keep one row per (name, normalized site) pair.
clean = []
seen = set()
for row in faculties:
    raw_name, site = row['faculty_name'].strip(), row['website'].strip()
    # Placeholder and e-mail links are not websites.
    if site == '#' or site.startswith('mailto:'):
        continue
    dom = domain_of(site)
    if not dom:
        continue
    m = NAME_TRUNC_RE.search(raw_name)
    # Prefer the regex-trimmed name; otherwise keep the text before the first
    # run of 2+ whitespace characters, pipe or comma.
    name = m.group(1).strip() if m else re.split(r'\s{2,}|\||,', raw_name)[0].strip()
    key = (name, normalize_site(site))
    if key not in seen:
        seen.add(key)
        # The site is stored as scraped; the normalized form is only the dedupe key.
        clean.append({'faculty_name': name, 'website': site})

df = pd.DataFrame(clean).reset_index(drop=True)

In [21]:
# Save to CSV
from pathlib import Path

# Write the cleaned faculties table (`df`) to disk without the index column.
# NOTE(review): this is an absolute, machine-specific path, while the notebook
# introduction says the CSV is saved "in this folder" — confirm which location
# is intended before running this on another machine.
output_path = Path('/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/university_novisad_faculties.csv')
df.to_csv(output_path, index=False)
print(f'Saved {len(df)} rows to {output_path}')

Saved 11 rows to /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/university_novisad_faculties.csv


## Extend: Laboratories and Scientific Centers

This section adds two extra extractions:

- Laboratories list: https://www.uns.ac.rs/index.php/en/science/scientific-potentials-of-uns/laboratories
  - Extracts the list items (faculty/institute names) and their links shown in the page list.
- Scientific centers: https://www.uns.ac.rs/index.php/en/science/scientific-potentials-of-uns/scientific-centers
  - For selected faculties (Philosophy, Law, Technical Sciences, Sciences, Academy of Arts, Agriculture, Institute of Food Technology) it opens the dedicated page and extracts each center's name and its website.

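Two CSVs will be produced in the output directory hard-coded in the cells below (currently an absolute path on the author's machine, not necessarily next to this notebook):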
- university_novisad_laboratories.csv
- university_novisad_scientific_centers.csv

In [4]:
# Laboratories list extraction
from urllib.parse import urljoin

LABS_URL = "https://www.uns.ac.rs/index.php/en/science/scientific-potentials-of-uns/laboratories"

def is_faculty_or_institute(text: str) -> bool:
    """Tell whether ``text`` names a faculty, institute, academy or BioSense.

    Matching is a case-insensitive substring test; ``None`` and empty strings
    give ``False``.
    """
    lowered = (text or '').lower()
    keywords = ('faculty of', 'institute of', 'technical faculty', 'academy of', 'biosense')
    return any(keyword in lowered for keyword in keywords)


def parse_laboratories(html: str):
    """Extract the faculty/institute links listed on the laboratories page.

    The search is limited to the block that encloses the heading containing
    "list of laboratories" when such a heading and block exist; otherwise the
    whole document is scanned.

    Args:
        html: Markup of the laboratories page.

    Returns:
        A list of ``{'entity_name': ..., 'website': ...}`` dicts, one per
        unique (anchor text, absolute URL) pair, in document order. Relative
        hrefs are resolved against ``LABS_URL``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    seen = set()

    # Try to scope around heading 'THE LIST OF LABORATORIES'
    heading = next(
        (
            tag
            for tag in soup.find_all(['h1', 'h2', 'h3', 'h4'])
            if 'list of laboratories' in tag.get_text(' ', strip=True).lower()
        ),
        None,
    )
    container = soup
    if heading is not None:
        # find_parent() yields None when the heading has no div/section/article
        # ancestor; fall back to the whole document instead of failing on
        # None.find_all below.
        container = heading.find_parent(['div', 'section', 'article']) or soup

    for a in container.find_all('a', href=True):
        name = a.get_text(' ', strip=True)
        if not is_faculty_or_institute(name):
            continue
        href = a['href']
        # In-page fragment links do not lead to a separate page.
        if href.startswith('#'):
            continue
        full = urljoin(LABS_URL, href)
        key = (name, full)
        if key in seen:
            continue
        seen.add(key)
        results.append({'entity_name': name, 'website': full})
    return results

# Fetch the laboratories page and turn the parsed rows into a table.
labs_html = fetch_html(LABS_URL)
labs = parse_laboratories(labs_html)

labs_df = pd.DataFrame(labs).reset_index(drop=True)

# The `_Path` alias is reused by the scientific-centers cell further down, so
# this cell has to run before that one.
from pathlib import Path as _Path
# NOTE(review): absolute, machine-specific path; the section intro says the CSVs
# are produced "next to this notebook" — confirm the intended location.
_labs_out = _Path('/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/university_novisad_laboratories.csv')
labs_df.to_csv(_labs_out, index=False)
print(f"Saved {len(labs_df)} labs to {_labs_out}")
# Trailing expression: shows the first ten rows as the cell output.
labs_df.head(10)

Saved 13 labs to /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/university_novisad_laboratories.csv


Unnamed: 0,entity_name,website
0,Faculty of Philosophy,https://www.uns.ac.rs/index.php/en/science/sci...
1,Faculty of Agriculture,https://www.uns.ac.rs/index.php/en/science/sci...
2,Faculty of Technology,https://www.uns.ac.rs/index.php/en/science/sci...
3,Faculty of Technical Sciences,https://www.uns.ac.rs/index.php/en/science/sci...
4,Faculty of Medicine,https://www.uns.ac.rs/index.php/en/science/sci...
5,Faculty of Sciences,https://www.uns.ac.rs/index.php/en/science/sci...
6,Academy of Arts,https://www.uns.ac.rs/index.php/en/science/sci...
7,Faculty of Civil Engineering,https://www.uns.ac.rs/index.php/en/science/sci...
8,"""Mihajlo Pupin"" Technical Faculty",https://www.uns.ac.rs/index.php/en/science/sci...
9,Faculty of Education,https://www.uns.ac.rs/index.php/en/science/sci...


In [5]:
# Scientific centers extraction (selected faculties)
# Index page that links to the per-faculty scientific-centre pages.
SCIENTIFIC_CENTERS_URL = "https://www.uns.ac.rs/index.php/en/science/scientific-potentials-of-uns/scientific-centers"
# Link texts to follow from the index page. find_faculty_links compares the
# stripped anchor text against this set with an exact, case-sensitive match.
TARGET_FACULTIES = {
    'Faculty of Philosophy',
    'Faculty of Law',
    'Faculty of Technical Sciences',
    'Faculty of Sciences',
    'Academy of Arts',
    'Faculty of Agriculture',
    'Institute of Food Technology',
}


def find_faculty_links(html: str) -> dict:
    """Map each target faculty name to the absolute URL of its centres page.

    Only anchors whose stripped text is exactly one of ``TARGET_FACULTIES`` are
    kept. Hrefs are resolved against ``SCIENTIFIC_CENTERS_URL``. If a name
    appears on several anchors, the last one in document order wins.
    """
    anchors = BeautifulSoup(html, 'html.parser').find_all('a', href=True)
    labelled = ((anchor.get_text(' ', strip=True), anchor['href']) for anchor in anchors)
    return {
        label: urljoin(SCIENTIFIC_CENTERS_URL, href)
        for label, href in labelled
        if label in TARGET_FACULTIES
    }


def extract_centers_for_faculty(html: str) -> list[dict]:
    """Extract scientific centres (name + website) from one faculty page.

    Every ``<p>`` whose text contains "name of the centre" (case-insensitive)
    yields one record. The website is the first non-mailto link of that
    paragraph or, failing that, of one of the next few paragraphs that mention
    "more information". If no paragraph matches at all, ``<strong>``/``<b>``
    labels are scanned instead.

    Args:
        html: Markup of a faculty's scientific-centres page.

    Returns:
        A list of ``{'center_name': ..., 'website': ...}`` dicts in document
        order; ``website`` is ``''`` when no link was found. Hrefs are returned
        as they appear in the markup (they are not resolved to absolute URLs).
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = []

    # Strategy: each center is described in a <p> block starting with "Name of the centre:".
    # We'll iterate paragraphs and capture name + the first URL afterward.
    paras = soup.find_all('p')
    for i, p in enumerate(paras):
        text = p.get_text(' ', strip=True)
        if not text:
            continue
        if 'name of the centre' not in text.lower():
            continue
        # Extract the center name after the label
        name = ''
        # The name ends at the first run of 2+ whitespace characters, at the next
        # label ("Scientific field" / "More information"), or at end of text.
        m = re.search(r'Name of the centre\s*:\s*([^\n\r]+?)(?:\s{2,}|\s*Scientific field|\s*More information|$)', text, flags=re.I)
        if m:
            name = m.group(1).strip()
        else:
            # Fallback: remove the prefix and take first sentence
            name = re.sub(r'^\s*Name of the centre\s*:\s*', '', text, flags=re.I)
            name = re.split(r'\.|\n|\r|Scientific field|More information', name, flags=re.I)[0].strip()
        # Find website in the same paragraph first
        url = ''
        # Only the FIRST anchor of the paragraph is inspected; if it is a mailto
        # link, later anchors in the same paragraph are not considered.
        a = p.find('a', href=True)
        if a and a['href'] and not a['href'].startswith('mailto:'):
            url = a['href']
        else:
            # Search forward until the next center block for a 'More information available at' link
            # (looks at no more than the five following paragraphs)
            for j in range(i+1, min(i+6, len(paras))):
                txt = paras[j].get_text(' ', strip=True)
                if 'name of the centre' in (txt or '').lower():
                    break
                aa = paras[j].find('a', href=True)
                if aa and aa['href'] and not aa['href'].startswith('mailto:'):
                    if 'more information' in (txt or '').lower():
                        url = aa['href']
                        break
        # NOTE(review): the record is appended even when `name` is empty, whereas
        # the fallback below skips empty names — confirm which is intended.
        out.append({'center_name': name, 'website': url})
    # Fallback: If nothing found, scan strong/b tags
    if not out:
        for strong in soup.find_all(['strong','b']):
            t = strong.get_text(' ', strip=True)
            if 'name of the centre' in (t or '').lower():
                # Use the text of the element that contains the bold label.
                seg = strong.parent.get_text(' ', strip=True)
                m = re.search(r'Name of the centre\s*:\s*([^\n\r]+?)(?:\s{2,}|\s*Scientific field|\s*More information|$)', seg, flags=re.I)
                name = m.group(1).strip() if m else ''
                # NOTE(review): unlike the paragraph path above, mailto links are
                # not filtered out here.
                a = strong.parent.find('a', href=True)
                url = a['href'] if a else ''
                if name:
                    out.append({'center_name': name, 'website': url})
    return out


# Fetch faculty index, then each selected faculty page
centers_index_html = fetch_html(SCIENTIFIC_CENTERS_URL)
faculty_links = find_faculty_links(centers_index_html)

# One output row per (faculty, centre); `source` records the page it came from.
# NOTE(review): in the sample output a row for "Center for the Studies of Roman
# Law" (a pf.uns.ac.rs link) is listed under "Faculty of Philosophy". Possibly a
# faculty link resolves to a page that lists centres of several faculties —
# verify the attribution and the total row count.
rows = []
for fac, link in faculty_links.items():
    try:
        h = fetch_html(link)
        centers = extract_centers_for_faculty(h)
        for c in centers:
            rows.append({
                'faculty_name': fac,
                'center_name': c['center_name'],
                'website': c['website'],
                'source': link,
            })
    except Exception as e:
        # Best effort: a failure for one faculty is reported and the loop continues.
        print(f"Failed to parse centers for {fac}: {e}")

centers_df = pd.DataFrame(rows).reset_index(drop=True)
# `_Path` is the pathlib alias imported in the laboratories cell, so that cell
# must have been executed before this one.
# NOTE(review): absolute, machine-specific output path — confirm intended location.
_centers_out = _Path('/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/university_novisad_scientific_centers.csv')
centers_df.to_csv(_centers_out, index=False)
print(f"Saved {len(centers_df)} centers to {_centers_out}")
# Trailing expression: shows the first fifteen rows as the cell output.
centers_df.head(15)

Saved 318 centers to /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/Novi Sad/university_novisad_scientific_centers.csv


Unnamed: 0,faculty_name,center_name,website,source
0,Faculty of Philosophy,Centre for Languages,http://centarzajezike.ff.uns.ac.rs/,https://www.uns.ac.rs/index.php/en/science/sci...
1,Faculty of Philosophy,Teacher Training Centre,http://www.cun.ff.uns.ac.rs/,https://www.uns.ac.rs/index.php/en/science/sci...
2,Faculty of Philosophy,Centre for Research in Sociology Scientific fi...,http://www.ff.uns.ac.rs/fakultet/fakultet_cent...,https://www.uns.ac.rs/index.php/en/science/sci...
3,Faculty of Philosophy,Centre for Serbian as a Foreign Language Scien...,http://www.srpski-strani.com/,https://www.uns.ac.rs/index.php/en/science/sci...
4,Faculty of Philosophy,Centre for Bihevioral Genetics,http://cbg.ff.uns.ac.rs/sr/,https://www.uns.ac.rs/index.php/en/science/sci...
5,Faculty of Philosophy,Centre for the Hungarian Language,http://www.ff.uns.ac.rs/fakultet/fakultet_cent...,https://www.uns.ac.rs/index.php/en/science/sci...
6,Faculty of Philosophy,Centre for Student Suport Services,,https://www.uns.ac.rs/index.php/en/science/sci...
7,Faculty of Philosophy,Centro Iberoamericano (CIBAM),http://ff.uns.ac.rs/sr/fakultet/o-fakultetu/ce...,https://www.uns.ac.rs/index.php/en/science/sci...
8,Faculty of Philosophy,FOPEC (Faculty of Philosophy Exam Centre) - Ca...,,https://www.uns.ac.rs/index.php/en/science/sci...
9,Faculty of Philosophy,Center for the Studies of Roman Law,http://www.pf.uns.ac.rs/nauka/naucni-rad,https://www.uns.ac.rs/index.php/en/science/sci...
