#### Setup

In [None]:
import os
import re
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Optional

import pandas as pd
import fitz  # PyMuPDF


#### Funkcije za procesiranje i normalizaciju podataka

In [None]:
# Set up the directories used by the rest of the notebook.
PROJECT_DIR = Path(".").resolve()  # absolute path of the current working directory
PDF_DIR = PROJECT_DIR / "pdfs"  # the *.pdf inputs are globbed from this directory below
OUT_DIR = PROJECT_DIR / "out"
OUT_DIR.mkdir(exist_ok=True)  # exist_ok: re-running the cell must not fail

# NOTE(review): CACHE_DIR is created here but not referenced by the visible code.
CACHE_DIR = OUT_DIR / "cache"
CACHE_DIR.mkdir(exist_ok=True)

# Sorted so that the processing order is deterministic across runs.
pdf_paths = sorted(PDF_DIR.glob("*.pdf"))
print("Broj PDF-ova:", len(pdf_paths))

# Helper functions for text normalisation.
# Typographic ligatures mapped to their plain-letter spelling.
_LIGATURE_TABLE = str.maketrans({
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬀ": "ff",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
})

def normalize_text(s: Any) -> str:
    """Coerce any input to a normalised string.

    None becomes "", non-strings go through str(). Ligatures are expanded,
    runs of spaces/tabs collapse to one space, three or more consecutive
    newlines collapse to two, and the result is stripped.
    """
    if s is None:
        return ""
    text = s if isinstance(s, str) else str(s)
    text = text.translate(_LIGATURE_TABLE)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def remove_reference_noise(text: str) -> str:
    """Delete URLs and bare web addresses (noise in references), then normalise the text."""
    # Applied one after the other: full http(s) URLs first, then what still
    # starts with "www.". The order is part of the behaviour.
    for url_pattern in (r"http[s]?://\S+", r"www\.\S+"):
        text = re.sub(url_pattern, "", text)
    return normalize_text(text)


Broj PDF-ova: 110


#### Procesiranje autora i naslova

In [None]:
# Global stop markers: _looks_like_body_text treats a line as body text when
# its lower-cased form contains any of these as a substring.
_STOP_MARKERS = [
    "abstract", "introduction", "keywords", "key words",
    "article", "open", "received", "accepted", "published",
    "correspondence", "email", "doi", "journal", "npj",
]
# Lower-case words that extract_authors_from_page skips when it checks that
# the words of a candidate name start with a capital letter.
_SMALL_NAME_WORDS = {"van", "von", "de", "der", "da", "di", "del", "la", "le", "du", "of", "and"}

# Whole-word, case-insensitive institution keywords.
_AFFILIATION_RE = re.compile(
    r"\b(University|Department|Institute|Laboratory|Centre|Center|Present address|Address)\b",
    flags=re.IGNORECASE,
)

def looks_like_affiliation(line: str) -> bool:
    """Return True when the line contains an institution keyword, i.e. is probably an affiliation."""
    return _AFFILIATION_RE.search(line) is not None

def _looks_like_body_text(line: str) -> bool:
    """Heuristic: does this line belong to the running text rather than the author list?

    Used to stop author extraction. A line counts as body text when it contains
    any stop marker (substring match on the lower-cased line), or when it looks
    like an ordinary sentence: longer than 80 characters, containing "." or ",",
    and with more than 55% of its letters in lower case.
    """
    if not line:
        return False

    lowered = line.lower()
    for marker in _STOP_MARKERS:
        if marker in lowered:
            return True

    # Share of lower-case letters among all alphabetic characters.
    alpha_total = 0
    alpha_lower = 0
    for ch in line:
        if ch.isalpha():
            alpha_total += 1
            if ch.islower():
                alpha_lower += 1
    lower_share = alpha_lower / alpha_total if alpha_total else 0

    has_sentence_punct = "." in line or "," in line
    return lower_share > 0.55 and has_sentence_punct and len(line) > 80

def extract_title_from_page(first_page_text: str) -> str:
    """Extract the article title from the text of the first page.

    Collects up to five consecutive non-empty lines, starting after the last
    "ARTICLE"/"OPEN" banner line seen in the first 80 lines, and stops at the
    first line that looks like an affiliation, journal metadata, an e-mail
    address or an author list. Returns "" for empty input.
    """
    if not first_page_text:
        return ""

    stripped = [raw.strip() for raw in first_page_text.splitlines()]
    lines = [ln for ln in stripped if ln]

    # Skip past the "ARTICLE" / "OPEN" banner lines if present (the last one wins).
    first = 0
    for pos, ln in enumerate(lines[:80], start=1):
        if ln.upper() in ("ARTICLE", "OPEN"):
            first = pos

    meta_prefixes = ("npj ", "published", "doi", "received", "accepted")
    collected = []
    for ln in lines[first:first + 30]:
        lowered = ln.lower()
        # Stop at affiliations, journal metadata or an e-mail address.
        if looks_like_affiliation(ln) or lowered.startswith(meta_prefixes) or "@" in ln:
            break
        if collected:
            # Once part of the title is collected, a line with digits, commas
            # or " and " is taken as the author list and ends the title ...
            author_like = re.search(r"\d", ln) or " and " in lowered or "," in ln
            # ... unless it contains typical sentence words, in which case it
            # is more likely running text than a list of names.
            if author_like and not re.search(r"\b(we|have|has|using|show|are|is)\b", lowered):
                break
        collected.append(re.sub(r"\s+", " ", ln).strip())
        if len(collected) >= 5:
            break

    title = " ".join(collected).strip()
    # For longer titles, cut off a trailing "Firstname Lastname"-shaped pair
    # (an author name that may be glued to the end of the title).
    if len(title) > 40:
        tail = re.match(r"^(.*?)(?:\s+[A-Z][a-zA-Z.\-’']+\s+[A-Z][a-zA-Z.\-’']+)$", title)
        if tail:
            title = tail.group(1).strip()
    return title

def _index_after_title(lines: list[str], title: str, max_scan: int = 80, max_span: int = 5) -> int:
    """Return the index of the first line after the title, or 0 if it is not found.

    The title may have been assembled from several consecutive lines, so it is
    compared (case-insensitively) against windows of 1..max_span joined lines
    within the first max_scan lines, not only against single lines. Spans are
    tried in ascending order so the window ends on the title's last line.
    """
    needle = re.sub(r"\s+", " ", title or "").strip().lower()
    if not needle:
        return 0
    head = [ln.lower() for ln in lines[:max_scan]]
    for first in range(len(head)):
        for span in range(1, max_span + 1):
            if first + span > len(head):
                break
            if needle in " ".join(head[first:first + span]):
                return first + span
    return 0


def extract_authors_from_page(first_page_text: str, title: str) -> str:
    """Extract the author names listed below the title; handles multi-line author lists.

    Args:
        first_page_text: Plain text to scan. None or empty text yields "".
        title: Title already extracted for the same document; scanning starts
            on the line after it. If empty or not found, scanning starts at
            the first line.

    Returns:
        The unique author names in order of appearance, joined with "; "
        (at most 25), or "" when no author line is recognised.
    """
    lines = [re.sub(r"\s+", " ", ln).strip() for ln in (first_page_text or "").splitlines()]
    lines = [ln for ln in lines if ln]
    if not lines:
        return ""

    # Start right after the title, including titles that span several lines.
    start_idx = _index_after_title(lines, title)

    author_lines = []
    for ln in lines[start_idx:start_idx + 12]:
        low = ln.lower().strip()
        # Skip article-type banner lines.
        if low == "article" or low == "open":
            continue
        # Stop at affiliations or metadata (journal name, DOI, ...).
        if looks_like_affiliation(ln) or low.startswith("npj ") or low.startswith("doi"):
            break
        if "introduction" in low or low.startswith("abstract"):
            break
        if _looks_like_body_text(ln):
            break
        if not author_lines:
            # The first author line must carry a hint (digits, commas, " and ", ✉),
            # must not end like a sentence and must hold >= 2 capitalised words.
            if re.search(r"\d", ln) or "," in ln or " and " in low or "✉" in ln:
                if not ln.rstrip().endswith(".") and len(re.findall(r"\b[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ’'\-]+\b", ln)) >= 2:
                    author_lines.append(ln)
        else:
            # Continuation lines are kept until one ends with a full stop
            # (lines starting with "and " are always kept).
            if low.startswith("and ") or not ln.rstrip().endswith("."):
                author_lines.append(ln)
            else:
                break

    raw_authors = " ".join(author_lines).strip()
    if not raw_authors:
        return ""

    # Remove e-mail addresses and footnote symbols/numbers.
    raw_authors = re.sub(r"\S+@\S+", " ", raw_authors)
    raw_authors = raw_authors.replace("✉", " ")
    raw_authors = raw_authors.replace("†", " ").replace("‡", " ").replace("*", " ")
    raw_authors = re.sub(r"\d+", " ", raw_authors)
    raw_authors = re.sub(r"\s+", " ", raw_authors).strip()

    # Split into individual names on commas, ";" or the word "and".
    parts = re.split(r",|;|\band\b", raw_authors)
    parts = [p.strip() for p in parts if p.strip()]

    cleaned_authors = []
    for tok in parts:
        # Drop leftover brackets from footnote markers.
        tok = re.sub(r"[(){}\[\]]", "", tok).strip()
        if len(tok) > 80 or len(tok.split()) < 2:
            continue
        alpha_words = [w for w in tok.split() if any(ch.isalpha() for ch in w)]
        if not alpha_words:
            continue
        # Count words that do not start with a capital letter; small name
        # particles ("van", "de", ...) are exempt.
        bad = 0
        for w in alpha_words:
            # Keep every Unicode letter (not only Latin-1) so that names such
            # as "Šikić" are judged by their real first letter.
            w_clean = "".join(ch for ch in w if ch.isalpha() or ch in "’'-")
            if not w_clean or w_clean.lower() in _SMALL_NAME_WORDS:
                continue
            if not w_clean[0].isupper():
                bad += 1
        if bad >= 2:
            continue
        cleaned_authors.append(tok)

    # Keep unique names (case-insensitive) in order of appearance, at most 25.
    first_seen = {}
    for name in cleaned_authors:
        first_seen.setdefault(name.lower(), name)
    return "; ".join(list(first_seen.values())[:25])


#### Batch procesiranje svih PDF-ova u direktoriju pdfs/

In [None]:
import sys

# Raise the interpreter's recursion limit for the rest of the session.
# NOTE(review): nothing in the visible code recurses, so the reason for the
# value 10000 is not evident here — confirm it is still needed.
sys.setrecursionlimit(10000)

# Fallback: only defined when no choose_page_limits exists in the session yet.
if "choose_page_limits" not in globals():
    def choose_page_limits(pdf_path: Path) -> tuple[int, int]:
        """Return (meta_pages, content_pages) for a PDF based on its file size.

        Always 2 metadata pages. Content pages shrink as the file grows:
        8 from 25 MiB, 12 from 10 MiB, otherwise 15.
        """
        meta_pages = 2
        megabytes = pdf_path.stat().st_size / (1024 * 1024)
        # (minimum size in MiB, content pages), largest threshold first.
        for min_mb, pages in ((25, 8), (10, 12)):
            if megabytes >= min_mb:
                return meta_pages, pages
        return meta_pages, 15

if "guess_year" not in globals():
    def guess_year(text: str) -> str:
        """Guess the publication year from header text; "" when nothing matches.

        Tried in order:
        1. the first year in parentheses, accepted only if "npj", "©",
           "author" or "published" occurs within 80 characters of it;
        2. a year on a line containing "©";
        3. a year following "Received", "Accepted" or "Published" on the
           same line (case-insensitive).
        """
        paren = re.search(r"\(\s*(19\d{2}|20\d{2})\s*\)", text)
        if paren is not None:
            lo = max(0, paren.start() - 80)
            context = text[lo:paren.end() + 80].lower()
            if any(hint in context for hint in ("npj", "©", "author", "published")):
                return paren.group(1)

        copyright_hit = re.search(r"©.*\b(19\d{2}|20\d{2})\b", text)
        if copyright_hit is not None:
            return copyright_hit.group(1)

        dated = re.search(r"(Received|Accepted|Published)[^\n]*\b(19\d{2}|20\d{2})\b", text, flags=re.IGNORECASE)
        return dated.group(2) if dated is not None else ""

# Fast text extraction
def extract_pages_text_fast(pdf_path: Path, max_pages: int) -> list[str]:
    """Return the normalised plain text of the first pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Upper bound on the number of leading pages to read;
            fewer are returned when the document is shorter.

    Returns:
        One normalised string per page, in page order.
    """
    # The context manager closes the document even when a page fails to load
    # or extract, so a broken PDF does not leak an open file handle.
    with fitz.open(str(pdf_path)) as doc:
        limit = min(doc.page_count, max_pages)
        return [normalize_text(doc.load_page(i).get_text("text")) for i in range(limit)]

# One dict per PDF; failed files get a placeholder record with the error text.
records = []
total = len(pdf_paths)

for i, pdf_path in enumerate(pdf_paths, start=1):
    try:
        meta_pages, content_pages = choose_page_limits(pdf_path)

        # Extract the leading pages once and slice, instead of opening and
        # parsing the same first pages separately for metadata and content.
        pages_needed = max(meta_pages, content_pages)
        pages_text = extract_pages_text_fast(pdf_path, max_pages=pages_needed)

        # Metadata: text of the first meta_pages pages.
        meta_pages_text = pages_text[:max(meta_pages, 0)]
        first_page_text = meta_pages_text[0] if meta_pages_text else ""
        head_text = "\n\n".join(meta_pages_text)
        text_head = remove_reference_noise(head_text)

        # Content column: text of the first content_pages pages, empty pages dropped.
        content_pages_text = pages_text[:max(content_pages, 0)]
        content_text = remove_reference_noise("\n\n".join([t for t in content_pages_text if t]))

        # Page-count column.
        with fitz.open(str(pdf_path)) as doc:
            num_pages = doc.page_count

        # Title, authors and year columns.
        title = extract_title_from_page(first_page_text)
        authors_raw = extract_authors_from_page(text_head, title)  # may be ""
        year = guess_year(head_text)

        # Split abstract from main content at the first line that starts
        # with "INTRODUCTION" (case-insensitive).
        abstract_text = ""
        content_main = content_text
        m_intro = re.search(r"\nINTRODUCTION\b", content_text, flags=re.IGNORECASE)
        if m_intro:
            intro_index = m_intro.start()
            abstract_text = content_text[:intro_index].strip()
            # Drop banner / journal / publication-date lines from the abstract.
            abstract_text = re.sub(r"(?im)^(ARTICLE|OPEN)\s*$", "", abstract_text)
            abstract_text = re.sub(r"(?im)^npj.*$", "", abstract_text)
            abstract_text = re.sub(r"(?im)^Published.*$", "", abstract_text)
            abstract_text = re.sub(r"\n{2,}", "\n", abstract_text).strip()
            content_main = content_text[intro_index:].strip()

        # Anything shorter than 200 characters is discarded as an abstract.
        if abstract_text and len(abstract_text) < 200:
            abstract_text = ""

        has_tables = bool(re.search(r"\b(Table|Tablica)\b", content_text, flags=re.IGNORECASE))

        # Extraction-quality notes stored alongside the record.
        issues = []
        if not title:
            issues.append("Nije pronađen naslov.")
        if len(content_text) < 3000:
            issues.append("Malo izvučenog teksta (mogući skenirani PDF).")
        if "�" in content_text:
            issues.append("Neispravni znakovi (enkodiranje).")
        issues.append(f"Sadržaj izvučen iz prvih {content_pages} stranica radi performansi.")

        # The 'content' column holds only the first 4000 characters of the main content.
        content_snippet = content_main.strip()
        if len(content_snippet) > 4000:
            content_snippet = content_snippet[:4000] + "\n\n[TRUNCATED]"

        records.append({
            "file_name": pdf_path.name,
            "title": title,
            "authors_raw": authors_raw,
            "year": year,
            "abstract": abstract_text,
            "content": content_snippet,
            "num_pages": num_pages,
            "has_tables": has_tables,
            "extraction_issues": " ".join(issues),
        })

        # Progress report every 10 files.
        if i % 10 == 0:
            print(f"Obrađeno {i}/{total}")

    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole batch, so
        # the failure is recorded in the output row and reported.
        records.append({
            "file_name": pdf_path.name,
            "title": "",
            "authors_raw": "",
            "year": "",
            "abstract": "",
            "content": "",
            "num_pages": 0,
            "has_tables": False,
            "extraction_issues": f"Greška pri obradi PDF-a: {type(e).__name__}: {e}",
        })
        print(f"[ERROR] {pdf_path.name}: {type(e).__name__}: {e}")


Obrađeno 10/110
Obrađeno 20/110
Obrađeno 30/110
Obrađeno 40/110
Obrađeno 50/110
Obrađeno 60/110
Obrađeno 70/110
Obrađeno 80/110
Obrađeno 90/110
Obrađeno 100/110
Obrađeno 110/110


#### Export u JSON, CSV, XML, Parquet

In [None]:
# NOTE(review): this rebinds OUT_DIR to a path relative to the current working
# directory; the setup cell defines it as PROJECT_DIR / "out". The two agree
# only while the working directory has not changed.
OUT_DIR = Path("out")
OUT_DIR.mkdir(exist_ok=True)

df = pd.DataFrame(records).copy()

# --- year: strip a trailing ".0" (e.g. "2020.0" -> "2020") ---
if "year" in df.columns:
    df["year"] = df["year"].fillna("").astype(str).str.replace(r"\.0$", "", regex=True)

# --- 1) CSV (authors kept only as the raw authors_raw string) ---
df.to_csv(OUT_DIR / "results.csv", index=False, encoding="utf-8")

# --- 2) JSON (authors as a list, authors_raw column dropped) ---
df_json = df.copy()
if "authors_raw" in df_json.columns:
    # Split the ";"-separated string into a list of non-empty, stripped names.
    df_json["authors"] = df_json["authors_raw"].fillna("").map(
        lambda s: [a.strip() for a in str(s).split(";") if a.strip()]
    )
    df_json = df_json.drop(columns=["authors_raw"])
else:
    # One independent empty list per row.
    df_json["authors"] = [[] for _ in range(len(df_json))]

with open(OUT_DIR / "results.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(df_json.to_dict(orient="records"), f, ensure_ascii=False, indent=2)

# --- 3) XML (strip characters that are not allowed in XML) ---
def safe_xml_cell(x):
    """Return str(x) with every character that XML 1.0 forbids removed.

    None becomes "". Allowed are tab, newline, carriage return and the ranges
    U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF. That excludes the other
    control characters below U+0020, lone surrogates and the non-characters
    U+FFFE / U+FFFF, any of which would make the document ill-formed.
    """
    s = "" if x is None else str(x)

    def is_xml_char(ch):
        cp = ord(ch)
        return (
            ch in "\t\n\r"
            or 0x20 <= cp <= 0xD7FF
            or 0xE000 <= cp <= 0xFFFD
            or cp >= 0x10000
        )

    return "".join(ch for ch in s if is_xml_char(ch))

df_xml = df.copy()
# Every cell of every column is passed through safe_xml_cell, so all values
# in df_xml end up as sanitised strings.
df_xml = df_xml.apply(lambda col: col.map(safe_xml_cell))

# Written as <papers> with one <paper> element per row.
df_xml.to_xml(
    OUT_DIR / "results.xml",
    index=False,
    root_name="papers",
    row_name="paper",
    parser="etree",
    xml_declaration=True,
    encoding="utf-8",
)

# --- 4) Parquet ---
# Written from df (the unsanitised frame), not from df_xml.
df.to_parquet(OUT_DIR / "results.parquet", index=False)

print("Spremljeno u out/: results.csv, results.json, results.xml, results.parquet")


Spremljeno u out/: results.csv, results.json, results.xml, results.parquet
