In [1]:
pip install requests beautifulsoup4



In [2]:
!apt-get update -y
!apt-get install -y tesseract-ocr tesseract-ocr-lit poppler-utils libreoffice
!pip install -q pytesseract pdf2image pdfminer.six PyMuPDF python-docx docx2txt pillow lxml python-slugify chardet

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,287 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,374 kB]
Get:14

In [3]:
import json, time, re, unicodedata
from datetime import datetime
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

# Root of the registry site; the list and login URLs below are derived from it
BASE_HOST  = "https://registrai.lt"
# First page of the search-result listing that main() starts crawling from
LIST_URL   = f"{BASE_HOST}/management/search/search_result/"
# Endpoint that guest_accept() posts the guest/license form to
LOGIN_URL  = f"{BASE_HOST}/login/do_login"
# Value passed as `timeout=` to every requests call in this cell
TIMEOUT    = 30
# Pause passed to time.sleep() after the login POST and after each listing page
DELAY_S    = 0.35
# Rows whose registration date falls in an earlier year are dropped by parse_list_page()
MIN_YEAR   = 2015

# Request headers sent with every GET/POST in this cell
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
}

def nrm(s: str) -> str:
    """Fold text for comparisons: drop diacritics, collapse whitespace runs, trim, lowercase.

    A falsy input (None or "") yields the empty string.
    """
    decomposed = unicodedata.normalize("NFD", s or "")
    # After NFD decomposition, accents are separate combining characters that can be filtered out
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    single_spaced = re.sub(r"\s+", " ", "".join(base_chars))
    return single_spaced.strip().lower()

def parse_date(text: str):
    """Parse a date out of ``text`` and return a ``datetime.date``, or None if none is found.

    The whole (stripped) string is first tried against a few fixed formats. If that
    fails, the first ``YYYY<sep>M<sep>D`` occurrence (separator ``-``, ``.`` or ``/``)
    anywhere in the text is used.
    """
    text = (text or "").strip()
    for fmt in ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d", "%d.%m.%Y", "%d-%m-%Y"):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue

    # Two-digit day alternatives must be tried before the one-digit form, and the match
    # must not stop in the middle of a number; otherwise "2020-01-15" embedded in other
    # text would match as "2020-01-1" and silently parse as the 1st of the month.
    m = re.search(
        r"(19|20)\d{2}[-./](0?[1-9]|1[0-2])[-./](3[01]|[12]\d|0?[1-9])(?!\d)",
        text,
    )
    if m is None:
        return None

    normalized = m.group(0).replace(".", "-").replace("/", "-")
    try:
        return datetime.strptime(normalized, "%Y-%m-%d").date()
    except ValueError:
        # Syntactically date-like but not a real calendar date (e.g. 2020-02-31)
        return None

def fetch_html(sess: requests.Session, url: str) -> "str | None":
    """GET ``url`` with the shared headers/timeout and return the response body as text.

    Request failures (connection errors, timeouts, non-2xx statuses) are not raised:
    they are printed and ``None`` is returned, so callers must check for ``None``.
    """
    try:
        response = sess.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page content: {e}")
        return None

def guest_accept(sess: requests.Session):
    """Post the guest/license-acceptance form so the session can view the listing.

    Raises whatever ``requests`` raises for a failed POST or a non-2xx status.
    """
    # Same fields as the guest form on the login page: hidden guest=1 plus the license checkbox
    form_fields = dict(
        guest="1",
        accept_license="1",
        guest_submit="Duomenų peržiūra",
    )
    reply = sess.post(
        LOGIN_URL,
        data=form_fields,
        headers=HEADERS,
        allow_redirects=True,
        timeout=TIMEOUT,
    )
    reply.raise_for_status()
    time.sleep(DELAY_S)

def find_results_table(soup: BeautifulSoup):
    """Return the first table whose first row has all three expected column headers.

    Header cells are compared after ``nrm`` folding (diacritics removed, lowercased).
    Returns None when no table qualifies.
    """
    required_headers = (
        "bendra informacija: pavadinimas",
        "bendra informacija: iregistravimo data",
        "bendra informacija: specifikacija",
    )
    for candidate in soup.find_all("table"):
        first_row = candidate.find("tr")
        if not first_row:
            continue
        folded_cells = [
            nrm(cell.get_text(" ", strip=True))
            for cell in first_row.find_all(["th", "td"])
        ]
        # Every required header must appear as a substring of at least one header cell
        if all(any(needle in cell for cell in folded_cells) for needle in required_headers):
            return candidate
    return None

def get_col_indices(table):
    """Locate the title, registration-date and specification columns in the header row.

    Returns ``(title_idx, date_idx, spec_idx)``; an entry is None when that header
    is missing. If several cells match the same header, the last one wins.
    """
    header_cells = table.find("tr").find_all(["th", "td"])
    # Checked in this order; a cell is assigned to the first needle it contains
    needles = (
        ("title", "bendra informacija: pavadinimas"),
        ("date", "bendra informacija: iregistravimo data"),  # Į -> i after normalization
        ("spec", "bendra informacija: specifikacija"),
    )
    positions = {"title": None, "date": None, "spec": None}
    for column, cell in enumerate(header_cells):
        folded = nrm(cell.get_text(" ", strip=True))
        for key, needle in needles:
            if needle in folded:
                positions[key] = column
                break
    return positions["title"], positions["date"], positions["spec"]

def parse_list_page(html: str, base_for_links: str):
    """Extract qualifying registry rows and the next-page URL from one listing page.

    A row is kept when its specification column shows a "Taip" / accept icon, its
    title cell contains a link, and its registration date parses to a year
    >= MIN_YEAR.

    Args:
        html: Page markup. ``None`` or an empty string (e.g. after a failed fetch)
            is treated as a page with no results.
        base_for_links: URL used to resolve relative links found on the page.

    Returns:
        ``(items, next_url)`` where ``items`` is a list of
        ``{"title", "date", "link"}`` dicts and ``next_url`` is the absolute URL of
        the following page, or None when there is none (or the table was not found).
    """
    # A failed fetch hands us None; never pass that on to the HTML parser
    if not html:
        return [], None

    soup = BeautifulSoup(html, "html.parser")
    table = find_results_table(soup)
    if not table:
        return [], None

    title_idx, date_idx, spec_idx = get_col_indices(table)
    if None in (title_idx, date_idx, spec_idx):
        return [], None

    last_needed_col = max(title_idx, date_idx, spec_idx)
    items = []
    # Data rows carry the "elements" class
    for tr in table.select("tr.elements, tr.erow.elements"):
        tds = tr.find_all("td")
        if len(tds) <= last_needed_col:
            continue

        # Keep only rows with Specifikacija = Taip (green check icon)
        has_spec = any(
            (img.get("alt", "").strip().lower() == "taip")
            or ("accept.png" in (img.get("src", "") or "").lower())
            for img in tds[spec_idx].find_all("img")
        )
        if not has_spec:
            continue

        # Title text and link to the detail page
        a = tds[title_idx].find("a", href=True)
        if not a:
            continue

        # Registration date; rows without a parsable or recent-enough date are skipped
        d = parse_date(tds[date_idx].get_text(" ", strip=True))
        if not d or d.year < MIN_YEAR:
            continue

        items.append({
            "title": a.get_text(" ", strip=True),
            "date": d.isoformat(),
            "link": urljoin(base_for_links, a["href"]),
        })

    # Pagination: the next page is the link immediately after the current-page marker
    next_url = None
    pag = soup.find("div", class_="pagination")
    if pag:
        links = pag.find_all("a")
        cur_i = next(
            (i for i, link in enumerate(links) if link.get("id") == "pagination_current"),
            None,
        )
        if cur_i is not None and cur_i + 1 < len(links):
            href = links[cur_i + 1].get("href")
            if href and href != "#":
                next_url = urljoin(base_for_links, href)
    return items, next_url

def main():
    """Phase 1: accept the guest license, crawl every listing page, save the rows as JSON.

    Writes ``registrai_output.json`` in the working directory. Crawling stops at the
    first page that cannot be fetched, when no next page is advertised, or when
    pagination points back to a page that was already visited.
    """
    sess = requests.Session()
    # 1) Accept the license as guest (best effort: the listing is still attempted on failure)
    try:
        guest_accept(sess)
    except Exception as e:
        print(f"[WARN] Guest accept failed (continuing): {e}")

    # 2) Crawl listing pages
    url = LIST_URL
    results = []
    visited = set()  # guards against pagination links that loop back
    while url and url not in visited:
        visited.add(url)
        try:
            html = fetch_html(sess, url)
        except Exception as e:
            print(f"[HTTP] {e} @ {url}")
            break

        # fetch_html reports request failures by returning None instead of raising
        if html is None:
            print(f"[HTTP] No content fetched @ {url}; stopping")
            break

        rows, next_url = parse_list_page(html, url)
        print(f"[DEBUG] Parsed {len(rows)} rows on this page")
        results.extend(rows)
        url = next_url
        time.sleep(DELAY_S)

    with open("registrai_output.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"Done. Saved {len(results)} items to registrai_output.json")

if __name__ == "__main__":
    main()


[DEBUG] Parsed 4 rows on this page
[DEBUG] Parsed 11 rows on this page
[DEBUG] Parsed 9 rows on this page
[DEBUG] Parsed 19 rows on this page
[DEBUG] Parsed 16 rows on this page
[DEBUG] Parsed 16 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
[DEBUG] Parsed 0 rows on this page
Done. Saved 75 items to registrai_output.json


In [7]:
import shutil
import os

# Directory that is wiped and recreated by this cell
OUTPUT_DIR = "downloaded_articles"

# WARNING: destructive. Any existing directory tree at OUTPUT_DIR is deleted,
# including everything inside it.
if os.path.exists(OUTPUT_DIR):
    # Remove the directory together with all of its contents
    shutil.rmtree(OUTPUT_DIR)
    print(f"Cleared '{OUTPUT_DIR}' directory.")

# Recreate it empty (exist_ok keeps this safe if nothing was removed above)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Recreated '{OUTPUT_DIR}' directory.")

Cleared 'downloaded_articles' directory.
Recreated 'downloaded_articles' directory.


In [None]:
# =======================
# Phase 2 — Fetch Specifikacija docs (metadata + original files)
# Output JSON:
# [
#   {
#     "title": "...",
#     "date":  "...",         # from input if present (unchanged)
#     "link":  "...",         # item detail page URL from input
#     "documents": [
#       {"title":"...", "date":"...", "link":"...", "file_format": "..."},
#       ...
#     ]
#   },
#   ...
# ]
# =======================

import os, re, json, time, unicodedata
from urllib.parse import urljoin
from datetime import datetime

import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup

# ---------------- CONFIG ----------------
# Site root and the endpoint guest_accept() posts the guest/license form to
BASE_HOST  = "https://registrai.lt"
LOGIN_URL  = f"{BASE_HOST}/login/do_login"

# Phase 1 output (read here), Phase 2 output (written here), and the download folder
IN_JSON    = "registrai_output.json"
OUT_JSON   = "registrai_output_with_docs.json"
OUT_DIR    = "downloaded_articles"   # originals saved here

# Networking
# Passed to requests as a (connect, read) timeout pair
CONNECT_TIMEOUT = 10
READ_TIMEOUT    = 60
TIMEOUT_TUPLE   = (CONNECT_TIMEOUT, READ_TIMEOUT)

# Retry policy mounted on every session by make_session(); note that POST is
# included in the retried methods, and that exhausting retries on a listed
# status does not raise by itself (raise_on_status=False)
RETRIES = Retry(
    total=5, connect=5, read=5,
    backoff_factor=0.8,
    status_forcelist=(429, 500, 502, 503, 504),
    allowed_methods=("GET", "POST"),
    raise_on_status=False,
)

# Default headers installed on each session
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
}

# Pause passed to time.sleep() after each processed item in run_phase2()
PAUSE_BETWEEN = 0.15

# --------------- UTILS ------------------
def nrm(s: str) -> str:
    """Normalize text for matching: strip accents, squeeze whitespace, trim, lowercase.

    None and "" both produce "".
    """
    decomposed = unicodedata.normalize("NFD", s or "")
    # NFD splits accented letters into base letter + combining mark; keep only the base letters
    unaccented = "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))
    squeezed = re.sub(r"\s+", " ", unaccented)
    return squeezed.strip().lower()

def safe_filename(name: str) -> str:
    """Turn a title into a file-system friendly stub of at most 180 characters.

    Characters other than word characters, whitespace, "." and "-" become "_";
    whitespace runs become a single "_"; leading/trailing "_" are trimmed.
    Returns "file" when nothing is left.
    """
    replaced = re.sub(r"[^\w\s.-]", "_", name, flags=re.UNICODE)
    joined = re.sub(r"\s+", "_", replaced).strip("_")
    shortened = joined[:180]
    return shortened if shortened else "file"

def parse_date_fuzzy(text: str):
    """Parse a date out of ``text`` and return it as an ISO ``YYYY-MM-DD`` string, or None.

    The whole (stripped) string is first tried against a few fixed formats. If that
    fails, the first ``YYYY<sep>M<sep>D`` occurrence (separator ``-``, ``.`` or ``/``)
    anywhere in the text is used.
    """
    text = (text or "").strip()
    for fmt in ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d", "%d.%m.%Y", "%d-%m-%Y"):
        try:
            return datetime.strptime(text, fmt).date().isoformat()
        except ValueError:
            continue

    # Two-digit day alternatives must be tried before the one-digit form, and the match
    # must not stop in the middle of a number; otherwise "2020-01-15" embedded in other
    # text would match as "2020-01-1" and silently parse as the 1st of the month.
    m = re.search(
        r"(19|20)\d{2}[-./](0?[1-9]|1[0-2])[-./](3[01]|[12]\d|0?[1-9])(?!\d)",
        text,
    )
    if m is None:
        return None

    normalized = m.group(0).replace(".", "-").replace("/", "-")
    try:
        return datetime.strptime(normalized, "%Y-%m-%d").date().isoformat()
    except ValueError:
        # Syntactically date-like but not a real calendar date (e.g. 2020-02-31)
        return None

# ------------- HTTP HELPERS -------------
def make_session():
    """Create a ``requests.Session`` with the default headers and the RETRIES policy.

    One pooled adapter (20 connections) is shared by both the http:// and https:// mounts.
    """
    session = requests.Session()
    session.headers.update(HEADERS)
    retrying_adapter = HTTPAdapter(
        pool_connections=20,
        pool_maxsize=20,
        max_retries=RETRIES,
    )
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)
    return session

def fetch_html(sess: requests.Session, url: str) -> str:
    """GET ``url`` (following redirects) and return the body text.

    Raises the ``requests`` exception for transport failures or non-2xx statuses.
    """
    reply = sess.get(url, allow_redirects=True, timeout=TIMEOUT_TUPLE)
    reply.raise_for_status()
    body = reply.text
    return body

def guest_accept(sess: requests.Session):
    """Post the guest/license-acceptance form on ``sess``, then pause briefly.

    Raises the ``requests`` exception for a failed POST or a non-2xx status.
    """
    form_fields = dict(
        guest="1",
        accept_license="1",
        guest_submit="Duomenų peržiūra",
    )
    reply = sess.post(LOGIN_URL, data=form_fields, allow_redirects=True, timeout=TIMEOUT_TUPLE)
    reply.raise_for_status()
    time.sleep(0.2)

# ----- Specifikacija tab discovery ------
def find_spec_tab_url(soup: BeautifulSoup, base_url: str):
    """Return the URL of the page that holds the #tab6 (Specifikacija) content.

    ``base_url`` itself is returned when the page links to the in-page #tab6 tab, or
    when the first link labelled "specifikacija" is a fragment link. Otherwise that
    link is resolved against ``base_url``. Returns None when no such link exists.
    """
    tab_selector = 'a[href="#tab6"], a[aria-controls="tab6"], a[data-target="#tab6"]'
    if soup.select_one(tab_selector):
        return base_url

    # Fall back to the first anchor whose (folded) label mentions "specifikacija"
    spec_anchor = next(
        (
            link
            for link in soup.find_all("a", href=True)
            if "specifikacija" in nrm(link.get_text(" ", strip=True))
        ),
        None,
    )
    if spec_anchor is None:
        return None

    target = spec_anchor["href"]
    if target.startswith("#"):
        return base_url
    return urljoin(base_url, target)

# ---------- Download originals ----------
def guess_ext_from_headers(resp: requests.Response, url: str) -> str:
    """Guess a file extension (with leading dot, lowercase) for a download response.

    Sources are consulted in this order: the Content-Disposition filename, the URL
    path (query string removed), then keywords in Content-Type. Falls back to ".bin".
    """
    disposition = resp.headers.get("content-disposition", "")
    # Prefer the RFC 5987 style filename*=UTF-8''... form, then the plain filename= form
    name_match = (
        re.search(r'filename\*=UTF-8\'\'([^;]+)', disposition, flags=re.I)
        or re.search(r'filename="?([^";]+)"?', disposition, flags=re.I)
    )
    if name_match:
        header_ext = os.path.splitext(name_match.group(1))[1].lower()
        if header_ext:
            return header_ext

    url_ext = os.path.splitext(url.split("?")[0])[1].lower()
    if url_ext:
        return url_ext

    content_type = (resp.headers.get("content-type") or "").lower()
    # Order matters: the first entry with a matching keyword wins
    keyword_table = (
        (("pdf",), ".pdf"),
        (("msword",), ".doc"),
        (("officedocument.wordprocessingml.document",), ".docx"),
        (("rtf",), ".rtf"),
        (("html",), ".html"),
        (("asciidoc", "adoc"), ".adoc"),
        (("plain",), ".txt"),
    )
    for keywords, extension in keyword_table:
        if any(word in content_type for word in keywords):
            return extension
    return ".bin"

def download_file(sess: requests.Session, url: str, base_folder: str, stub: str) -> str:
    """Download ``url`` into ``base_folder`` as ``stub`` + guessed extension.

    The extension comes from ``guess_ext_from_headers``. If the target file already
    exists and is non-empty it is kept as-is (resume behaviour) and the body is not
    downloaded again.

    Returns:
        Path of the (existing or newly written) file.

    Raises:
        The ``requests`` exception for transport failures or non-2xx statuses, and
        OSError for file-system failures.
    """
    # The context manager releases the streamed connection on every exit path,
    # including the early "already downloaded" return.
    with sess.get(url, timeout=TIMEOUT_TUPLE, allow_redirects=True, stream=True) as r:
        r.raise_for_status()
        ext = guess_ext_from_headers(r, url)
        os.makedirs(base_folder, exist_ok=True)
        out = os.path.join(base_folder, stub + ext)
        # resume: skip if exists and non-empty
        if os.path.exists(out) and os.path.getsize(out) > 0:
            return out

        # Write to a side file and rename at the end, so an interrupted download never
        # leaves a truncated file that the resume check would later accept as complete.
        partial = out + ".part"
        try:
            with open(partial, "wb") as f:
                for chunk in r.iter_content(65536):
                    if chunk:
                        f.write(chunk)
            os.replace(partial, out)
        finally:
            # Only still present if the download or the rename failed
            if os.path.exists(partial):
                try:
                    os.remove(partial)
                except OSError:
                    pass
    return out

# ----- Extract only #tab6 documents -----
def extract_spec_docs(soup_or_html, base_url, sess):
    """Collect the documents listed inside the #tab6 (Specifikacija) tab.

    Each ``table.details_t`` under ``#tab6`` is scanned row by row: the
    "Patvirtinimo data" row supplies the approval date and the "Pavadinimas" row
    supplies the document title and link. For every linked document one streamed
    request is made only to read the response headers and guess the file format.

    Args:
        soup_or_html: A parsed ``BeautifulSoup`` tree or raw HTML text.
        base_url: URL used to resolve relative document links.
        sess: Session used for the header probe.

    Returns:
        List of ``{"title", "date", "link", "file_format"}`` dicts, de-duplicated by
        link (first occurrence kept). ``date`` is an ISO string when parsable, else the
        raw cell text, else None. ``file_format`` is None when the probe failed.
    """
    soup = soup_or_html if isinstance(soup_or_html, BeautifulSoup) else BeautifulSoup(soup_or_html, "html.parser")
    docs = []
    for tbl in soup.select('#tab6 table.details_t'):
        approval_date = None
        title = None
        href = None
        for tr in tbl.select("tr"):
            lbl = tr.find("td", class_="input_label")
            label = lbl.get_text("", strip=True) if lbl else ""
            cells = tr.find_all("td")
            if "Patvirtinimo data" in label:
                if cells:
                    raw = (cells[-1].get_text(" ", strip=True) or "").strip()
                    approval_date = parse_date_fuzzy(raw) or raw or None
            if "Pavadinimas" in label:
                # Prefer the dedicated value cell; otherwise use the row's last cell
                field = tr.find("td", class_="input_field") or (cells[-1] if cells else None)
                a = field.find("a", href=True) if field else None
                if a:
                    title = a.get_text(" ", strip=True)
                    href = urljoin(base_url, a["href"])
                else:
                    title = href = None
        if href:
            file_format = None
            try:
                file_format = _probe_file_format(sess, href)
            except Exception:
                # Best effort: a host-relative link gets one more try against BASE_HOST;
                # any other failure just leaves the format unknown.
                if href.startswith("/"):
                    try:
                        file_format = _probe_file_format(sess, urljoin(BASE_HOST, href))
                    except Exception:
                        pass
            docs.append({"title": title, "date": approval_date, "link": href, "file_format": file_format})

    # de-duplicate by link
    seen, uniq = set(), []
    for d in docs:
        if d["link"] in seen:
            continue
        seen.add(d["link"])
        uniq.append(d)
    return uniq


def _probe_file_format(sess, url):
    """Open ``url`` as a stream, read only its headers, and return the extension without the dot.

    The response is always closed, whether or not the probe succeeds. Raises on
    transport errors or non-2xx statuses; returns None if no extension was guessed.
    """
    r = None
    try:
        r = sess.get(url, timeout=TIMEOUT_TUPLE, allow_redirects=True, stream=True)
        r.raise_for_status()
        ext = guess_ext_from_headers(r, url)
        return ext.lstrip(".").lower() if ext else None
    finally:
        if r is not None:
            r.close()

def process_object(sess: requests.Session, item: dict):
    """Fetch one registry object's Specifikacija documents and download the originals.

    The detail page at ``item["link"]`` is loaded; if the Specifikacija content lives
    on a different URL that page is loaded as well. Document metadata is extracted
    and each document is saved under ``OUT_DIR/<safe item title>/<safe doc title><ext>``.
    Download failures are reported and skipped; they do not abort the item.

    Returns:
        ``{"title", "date", "link", "documents"}`` with metadata only (no file paths).

    Raises:
        Whatever ``fetch_html`` raises when the detail or spec page cannot be loaded.
    """
    detail_html = fetch_html(sess, item["link"])
    dsoup = BeautifulSoup(detail_html, "html.parser")

    spec_url = find_spec_tab_url(dsoup, item["link"])
    spec_soup = dsoup
    if spec_url and spec_url != item["link"]:
        spec_html = fetch_html(sess, spec_url)
        spec_soup = BeautifulSoup(spec_html, "html.parser")

    docs = extract_spec_docs(spec_soup, spec_url or item["link"], sess)

    # download originals
    base_folder = os.path.join(OUT_DIR, safe_filename(item.get("title") or "item"))
    os.makedirs(base_folder, exist_ok=True)
    for i, d in enumerate(docs, 1):
        url = d["link"]
        stub = safe_filename(d.get("title") or f"document_{i}")
        try:
            download_file(sess, url, base_folder, stub)
        except Exception as first_error:
            if url.startswith("/"):
                # Host-relative link: retry once against the site root
                try:
                    download_file(sess, urljoin(BASE_HOST, url), base_folder, stub)
                except Exception as e:
                    print(f"   -> download failed: {url} ({e})")
            else:
                # Report the reason too, not just the URL, so failures can be diagnosed
                print(f"   -> download failed: {url} ({first_error})")

    # return only metadata in JSON
    return {
        "title": item.get("title"),
        "date":  item.get("date") or None,
        "link":  item.get("link"),
        "documents": docs
    }

def run_phase2(input_json=IN_JSON, output_json=OUT_JSON):
    """Phase 2 driver: enrich every Phase 1 item with its Specifikacija documents.

    Reads ``input_json``, processes the items in order (starting a fresh session
    before every 20th item), and writes the result list to ``output_json``. An item
    that fails is still written, with an empty ``documents`` list.
    """
    with open(input_json, "r", encoding="utf-8") as f:
        rows = json.load(f)

    sess = make_session()
    try:
        guest_accept(sess)
    except Exception as e:
        print(f"[WARN] Guest accept failed (continuing): {e}")

    out = []
    total = len(rows)
    try:
        for idx, item in enumerate(rows, 1):
            # refresh session every 20 items
            if idx % 20 == 0:
                try:
                    sess.close()
                except Exception:
                    pass
                sess = make_session()
                try:
                    guest_accept(sess)
                except Exception as e:
                    # Still best effort, but no longer silent: later items may fail because of this
                    print(f"[WARN] Guest accept failed after session refresh (continuing): {e}")

            print(f"[{idx}/{total}] {item.get('title')}")
            try:
                out.append(process_object(sess, item))
            except Exception as e:
                print(f"   -> ERROR: {e}")
                out.append({
                    "title": item.get("title"),
                    "date":  item.get("date") or None,
                    "link":  item.get("link"),
                    "documents": []
                })
            time.sleep(PAUSE_BETWEEN)
    finally:
        # Release the pooled connections of the last session as well
        try:
            sess.close()
        except Exception:
            pass

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f"Done. Saved {len(out)} items to {output_json}")

if __name__ == "__main__":
    run_phase2()


[1/75] Pastatų duomenų banko informacinė sistema
[2/75] Biologinės įvairovės informacinė sistema
[3/75] Teisėjų ir pretendentų į teisėjus informacinė sistema
[4/75] Nuotekų tvarkymo informacinė sistema
[5/75] Integruotų paslaugų platforma
[6/75] Statinių prieinamumo stebėsenos ir kontrolės informacinė sistema
[7/75] Viešosios įstaigos CPO LT Centralizuotų viešųjų pirkimų vykdymo informacinė sistema
[8/75] Žemės ūkio veiklą vykdančių ūkio subjektų patikrinimo aktų informacinė sistema
[9/75] Politinių organizacijų narių registras
[10/75] Nacionalinė turizmo informacinė sistema
[11/75] Paramos žemės ūkio veiklos subjektams apskaičiavimo informacinė sistema
[12/75] Kultūrinės edukacijos informacinė sistema
[13/75] Lietuvos sveikatos priežiūros specialistų kompetencijų platformos informacinė sistema
[14/75] Pieno apskaitos informacinė sistema
[15/75] VšĮ Vilniaus miesto klinikinės ligoninės informacinė sistema
[16/75] Taikomųjų inovacijų tyrimų ir rezultatų informacinė sistema
[17/75] Energ

In [None]:
import json
from collections import Counter

# JSON file read by main() below; each item is expected to carry a "documents" list
IN_JSON = "registrai_output_with_docs.json"

def normalize(fmt):
    """Canonicalize a file-format label: trim, drop leading dots, lowercase.

    Returns None for non-string input or when nothing remains after cleaning.
    """
    if isinstance(fmt, str):
        cleaned = fmt.strip().lstrip(".").lower()
        if cleaned:
            return cleaned
    return None

def main():
    """Print the distinct document formats found in IN_JSON and how often each occurs."""
    with open(IN_JSON, "r", encoding="utf-8") as handle:
        items = json.load(handle)

    # Flatten item -> documents, normalize each format label, and drop empty ones
    labels = (
        normalize(document.get("file_format"))
        for entry in items
        for document in entry.get("documents", [])
    )
    tally = Counter(label for label in labels if label)

    print("Distinct file formats:")
    for label in sorted(tally.keys()):
        print(f"- {label}")

    print("\nCounts:")
    for label, occurrences in tally.most_common():
        print(f"{label}: {occurrences}")

if __name__ == "__main__":
    main()


Distinct file formats:
- adoc
- docx
- pdf

Counts:
pdf: 72
docx: 28
adoc: 8


In [15]:
import shutil
import os

# Directory that is wiped and recreated by this cell (the TXT output folder,
# not the folder holding the downloaded originals)
OUTPUT_DIR = "downloaded_articles_txt"

# WARNING: destructive. Any existing directory tree at OUTPUT_DIR is deleted,
# including everything inside it.
if os.path.exists(OUTPUT_DIR):
    # Remove the directory together with all of its contents
    shutil.rmtree(OUTPUT_DIR)
    print(f"Cleared '{OUTPUT_DIR}' directory.")

# Recreate it empty (exist_ok keeps this safe if nothing was removed above)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Recreated '{OUTPUT_DIR}' directory.")

Cleared 'downloaded_articles_txt' directory.
Recreated 'downloaded_articles_txt' directory.


In [None]:
# =======================
# Phase 3 — Convert originals to TXT and build final JSON (3 batches of 25)
# Batches: items 1-25, 26-50, 51-75
# Writes per-batch JSON checkpoints and a final merged JSON.
# Output JSON:
# [
#   {
#     "title": "...",
#     "date":  "...",         # from input if present (unchanged)
#     "link":  "...",         # item detail page URL from input
#     "documents": [
#       {"title":"...", "date":"...", "text":"..."},
#       ...
#     ]
#   },
#   ...
# ]
# =======================

import os, re, json, glob, mimetypes, tempfile, pathlib, math, gc, io, zipfile
from urllib.parse import urlparse
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text as pdfminer_extract
from pdf2image import convert_from_bytes
import pytesseract
import docx2txt
import chardet

# ---------- Paths ----------
# Phase 2 metadata (input), downloaded originals (input), final JSON and TXT folder (outputs)
IN_JSON  = "registrai_output_with_docs.json"
IN_DIR   = "downloaded_articles"       # originals from Step 2
OUT_JSON = "registrai_output_texts.json"
OUT_DIR  = "downloaded_articles_txt"   # TXT output

# Both folders are created at cell-run time so later code can assume they exist
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(IN_DIR, exist_ok=True)

# ---------- Config ----------
# Default `lang` handed to pytesseract by extract_text_from_pdf_bytes()
OCR_LANG        = "eng+lit"   # Lithuanian + English OCR
# NOTE(review): presumably a (connect, read) timeout pair for requests calls;
# no use of TIMEOUT_TUPLE is visible in this part of the cell - confirm
CONNECT_TIMEOUT = 10
READ_TIMEOUT    = 120
TIMEOUT_TUPLE   = (CONNECT_TIMEOUT, READ_TIMEOUT)

# ---------- Session / cookie reuse (optional for fallback download if you add it later) ----------
def _parse_cookie_string(cookie_str: str):
    jar = {}
    if not cookie_str: return jar
    for part in cookie_str.split(";"):
        if "=" in part:
            k, v = part.strip().split("=", 1)
            jar[k.strip()] = v.strip()
    return jar

def make_session():
    """Return a session configured for downloads, reusing notebook-level crawl state if present.

    Optional globals honoured when defined: ``CRAWL_SESSION`` (used instead of a new
    session, and modified in place), ``CRAWL_HEADERS`` (dict merged over the defaults),
    ``CRAWL_COOKIES`` (dict) and ``CRAWL_COOKIES_STR`` (``"k=v; k2=v2"`` string; wins
    over the dict on name clashes).
    """
    namespace = globals()

    # Reuse the crawler's session when an earlier cell exported one
    if "CRAWL_SESSION" in namespace:
        session = namespace["CRAWL_SESSION"]
    else:
        session = requests.Session()

    retry_policy = Retry(
        total=5, connect=5, read=5,
        backoff_factor=0.6,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"],
        raise_on_status=False,
    )
    # Each scheme gets its own adapter instance
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))

    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "lt-LT,lt;q=0.9,en-US;q=0.8,en;q=0.7",
        "DNT": "1",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    })

    header_overrides = namespace.get("CRAWL_HEADERS")
    if isinstance(header_overrides, dict):
        session.headers.update(header_overrides)

    # Dict cookies first, then the string form, so the string form overrides on clashes
    cookie_map = {}
    dict_cookies = namespace.get("CRAWL_COOKIES")
    if isinstance(dict_cookies, dict):
        cookie_map.update(dict_cookies)
    raw_cookie_header = namespace.get("CRAWL_COOKIES_STR")
    if isinstance(raw_cookie_header, str):
        cookie_map.update(_parse_cookie_string(raw_cookie_header))

    for cookie_name, cookie_value in cookie_map.items():
        try:
            session.cookies.set(cookie_name, cookie_value)
        except Exception:
            # A cookie the jar rejects is skipped rather than aborting session setup
            pass

    return session

# ---------- Name helpers ----------
def safe_dirname(s: str) -> str:
    """Turn an item title into a directory-name stub of at most 180 characters.

    Characters other than word characters, whitespace, "." and "-" become "_";
    whitespace runs become a single "_"; leading/trailing "_" are trimmed.
    None or an empty result yields "item".
    """
    trimmed = (s or "").strip()
    replaced = re.sub(r"[^\w\s.-]", "_", trimmed)
    joined = re.sub(r"\s+", "_", replaced).strip("_")
    shortened = joined[:180]
    return shortened if shortened else "item"

def safe_filename(name: str) -> str:
    """Turn a document title into a file-name stub of at most 180 characters.

    Characters other than word characters, whitespace, "." and "-" become "_";
    whitespace runs become a single "_"; leading/trailing "_" are trimmed.
    Returns "file" when nothing is left.
    """
    replaced = re.sub(r"[^\w\s.-]", "_", name, flags=re.UNICODE)
    joined = re.sub(r"\s+", "_", replaced).strip("_")
    shortened = joined[:180]
    return shortened if shortened else "file"

# ---------- Discovery: local file for a given doc ----------
def find_local_original(base_raw_dir: str, doc_title: str):
    """Find the downloaded original for ``doc_title`` inside ``base_raw_dir``.

    Looks for ``<stub>.*`` first and then ``<stub>*``, where ``stub`` is the
    ``safe_filename`` form of the title. Among the matches of the first pattern that
    hits, the largest file wins (newest modification time breaks ties).
    Returns None when the directory does not exist or nothing matches.
    """
    if not os.path.isdir(base_raw_dir):
        return None

    stub = safe_filename(doc_title)

    def size_then_mtime(path):
        return (os.path.getsize(path), os.path.getmtime(path))

    for pattern in (f"{stub}.*", f"{stub}*"):
        matches = glob.glob(os.path.join(base_raw_dir, pattern))
        if matches:
            return max(matches, key=size_then_mtime)
    return None


def extract_text_from_pdf_bytes(data: bytes, ocr_lang: str = OCR_LANG):
    """Extract text from a PDF held in memory, trying three extractors and keeping the longest result.

    Extractors (all are run; none short-circuits the others):
      1) PyMuPDF page by page: a page's own text layer is used when it has at least
         100 non-blank characters, otherwise the page is rendered at 250 dpi and
         OCR'ed with Tesseract. Unreadable pages become ``[[UNREADABLE_PAGE_n]]``.
      2) ``pdftotext`` (Poppler), when the executable is on PATH.
      3) pdfminer.

    Any extractor that is unavailable or fails contributes an empty string.

    Args:
        data: Raw PDF bytes.
        ocr_lang: Tesseract language spec used for OCR'ed pages.

    Returns:
        ``(text, method, pages)`` where ``method`` is "pdf-ocr-stream", "pdftotext" or
        "pdfminer" (ties go to the earlier one in that order) and ``pages`` is the
        number of pages handled by the PyMuPDF pass, including pages whose text layer
        was used without OCR.
    """
    import shutil
    import subprocess
    import tempfile

    def _remove_quietly(path):
        """Delete a temp file, ignoring file-system errors."""
        if path and os.path.exists(path):
            try:
                os.remove(path)
            except OSError:
                pass

    # --- 1) PyMuPDF text layer / OCR, page by page ---
    ocr_text, ocr_pages = "", 0
    try:
        import fitz  # PyMuPDF
        from PIL import Image
        import pytesseract as _pyt
        dpi = 250
        zoom = dpi / 72.0  # PDF user space is 72 units per inch
        mat = fitz.Matrix(zoom, zoom)

        doc = fitz.open(stream=data, filetype="pdf")
        try:
            parts = []
            for pno in range(len(doc)):
                try:
                    page = doc.load_page(pno)
                    # If a page already has a decent text layer, use it
                    txt_layer = page.get_text("text") or ""
                    if len(txt_layer.strip()) >= 100:
                        parts.append(txt_layer)
                    else:
                        pix = page.get_pixmap(matrix=mat, alpha=False)
                        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                        t = _pyt.image_to_string(img, lang=ocr_lang) or ""
                        if not t.strip():
                            t = f"[[UNREADABLE_PAGE_{pno+1}]]"
                        parts.append(t)
                    ocr_pages += 1
                    if (pno + 1) % 2 == 0:
                        print(f"         ... OCR page {pno+1}/{len(doc)}")
                except Exception:
                    parts.append(f"[[UNREADABLE_PAGE_{pno+1}]]")
                    ocr_pages += 1
            ocr_text = "\n\n".join(parts)
        finally:
            doc.close()
    except Exception:
        # If PyMuPDF/PIL/Tesseract isn't available, this pass contributes nothing
        ocr_text, ocr_pages = "", 0

    # --- 2) pdftotext (Poppler) ---
    pdftotext_txt = ""
    try:
        if shutil.which("pdftotext"):
            tmp_path = None
            try:
                # delete=False so the external process can open the file by name;
                # it is removed explicitly below
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tf:
                    tf.write(data)
                    tmp_path = tf.name
                out = subprocess.check_output(
                    ["pdftotext", "-enc", "UTF-8", "-layout", tmp_path, "-"],
                    stderr=subprocess.STDOUT
                )
                pdftotext_txt = out.decode("utf-8", "ignore")
            finally:
                _remove_quietly(tmp_path)
    except Exception:
        pdftotext_txt = ""

    # --- 3) pdfminer ---
    pm_txt = ""
    try:
        from pdfminer.high_level import extract_text as _pdfminer_extract
        with tempfile.NamedTemporaryFile(suffix=".pdf") as tf:
            tf.write(data); tf.flush()
            pm_txt = _pdfminer_extract(tf.name) or ""
    except Exception:
        pm_txt = ""

    # --- Choose the longest result ---
    candidates = [
        ("pdf-ocr-stream", ocr_text),
        ("pdftotext",      pdftotext_txt),
        ("pdfminer",       pm_txt),
    ]
    method, best_text = max(candidates, key=lambda kv: len(kv[1] or ""))

    return best_text, method, ocr_pages


def extract_text_from_docx_bytes(data: bytes):
    """Extract text from DOCX bytes via docx2txt; returns ``(text, "docx2txt")``.

    The bytes are written to a temporary ``t.docx`` because docx2txt is given a path.
    """
    with tempfile.TemporaryDirectory() as workdir:
        docx_path = os.path.join(workdir, "t.docx")
        with open(docx_path, "wb") as handle:
            handle.write(data)
        extracted = docx2txt.process(docx_path)
        return (extracted or "", "docx2txt")

def extract_text_from_doc_bytes(data: bytes):
    """Extract text from legacy .doc bytes using headless LibreOffice.

    Returns:
        ``(text, "libreoffice-doc-txt")`` when the conversion produced ``t.txt``;
        otherwise the raw bytes decoded with the chardet-guessed encoding (UTF-8 if
        unknown) and the method "raw-decode-doc-fallback".
    """
    import subprocess

    with tempfile.TemporaryDirectory() as td:
        inp = os.path.join(td, "t.doc")
        with open(inp, "wb") as f:
            f.write(data)

        # Argument list instead of a shell command string: nothing is parsed by a shell
        try:
            subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "txt:Text", inp, "--outdir", td],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # LibreOffice missing or not executable: fall through to the raw-decode fallback
            pass

        txt_path = os.path.join(td, "t.txt")
        if os.path.exists(txt_path):
            with open(txt_path, "r", errors="ignore") as f:
                return f.read(), "libreoffice-doc-txt"
        enc = chardet.detect(data).get("encoding") or "utf-8"
        return data.decode(enc, errors="ignore"), "raw-decode-doc-fallback"

def extract_text_from_html_bytes(data: bytes):
    """Extract visible text from HTML bytes; returns ``(text, "html-bs4")``.

    The encoding is guessed with chardet (UTF-8 if unknown). Script, style, noscript,
    header, footer and nav elements are removed, and runs of three or more newlines
    are reduced to two.
    """
    detected = chardet.detect(data).get("encoding") or "utf-8"
    markup = data.decode(detected.lower(), errors="ignore")
    soup = BeautifulSoup(markup, "lxml")
    for node in soup.find_all(["script","style","noscript","header","footer","nav"]):
        node.extract()
    raw_text = soup.get_text("\n")
    squeezed = re.sub(r"\n{3,}", "\n\n", raw_text)
    return squeezed.strip(), "html-bs4"

def extract_text_generic_via_libreoffice_bytes(data: bytes, suffix: str):
    """Extract text from bytes of an arbitrary document type using headless LibreOffice.

    Args:
        data: Raw file bytes.
        suffix: File extension (with dot) given to the temporary input file;
            ".bin" is used when empty.

    Returns:
        ``(text, "libreoffice-generic")`` when ``t.txt`` exists after the conversion;
        otherwise the raw bytes decoded with the chardet-guessed encoding (UTF-8 if
        unknown) and the method "raw-decode-generic".
    """
    import subprocess

    with tempfile.TemporaryDirectory() as td:
        inp = os.path.join(td, f"t{suffix or '.bin'}")
        with open(inp, "wb") as f:
            f.write(data)

        # SECURITY: `suffix` can originate from a file name inside a downloaded archive.
        # Passing an argument list (no shell) keeps characters such as quotes, `$(...)`
        # or backticks in it from being interpreted as shell syntax.
        try:
            subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "txt:Text", inp, "--outdir", td],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # LibreOffice missing or not executable: fall through to the raw-decode fallback
            pass

        txt_path = os.path.join(td, "t.txt")
        if os.path.exists(txt_path):
            with open(txt_path, "r", errors="ignore") as f:
                return f.read(), "libreoffice-generic"
        enc = chardet.detect(data).get("encoding") or "utf-8"
        return data.decode(enc, errors="ignore"), "raw-decode-generic"

def extract_payload_from_adoc_container(data: bytes):
    """Pick the main document out of an ADOC file that is a ZIP container.

    Directory entries and anything under ``META-INF/`` or ``metadata/`` are ignored.
    Remaining members are ranked by extension preference (.docx, .pdf, .odt, .rtf,
    .doc, .html, .htm, .txt, then everything else), larger files first within a rank,
    then by name.

    Returns:
        ``(payload_bytes, payload_ext, note)``; ``(b"", None, "adoc-no-payload")`` when
        the archive holds no candidate member.
    """
    preferred = [".docx", ".pdf", ".odt", ".rtf", ".doc", ".html", ".htm", ".txt"]
    unranked = len(preferred)

    with zipfile.ZipFile(io.BytesIO(data), "r") as archive:
        ranked = []
        for member in archive.namelist():
            is_directory = member.endswith("/")
            is_metadata = member.startswith(("META-INF/", "metadata/"))
            if is_directory or is_metadata:
                continue
            suffix = os.path.splitext(member)[1].lower()
            try:
                nbytes = archive.getinfo(member).file_size
            except Exception:
                nbytes = 0
            rank = preferred.index(suffix) if suffix in preferred else unranked
            # Negated size so that, within one rank, the biggest member compares smallest
            ranked.append((rank, -nbytes, member, suffix or ".bin"))

        if not ranked:
            return b"", None, "adoc-no-payload"

        _rank, _neg_size, chosen, ext = min(ranked)
        return archive.read(chosen), ext, f"adoc:{chosen}"

def extract_text_from_adoc_bytes(data: bytes):
    """Extract text from a plain-text AsciiDoc file (an ``.adoc`` that is not a ZIP container).

    When the ``asciidoctor`` executable is on PATH, the document is rendered to HTML
    and the visible text is taken from that. If it is absent or anything in that path
    fails, a lightweight regex pass strips common AsciiDoc markup instead.

    Returns:
        ``(text, "asciidoctor->html->text")`` or ``(text, "adoc-strip")``.
    """
    # --- Try asciidoctor -> HTML -> text ---
    try:
        import shutil, subprocess, tempfile
        from bs4 import BeautifulSoup

        if shutil.which("asciidoctor"):
            tmp_path = None
            try:
                # delete=False so the external process can open the file by name;
                # it is removed explicitly below
                with tempfile.NamedTemporaryFile(suffix=".adoc", delete=False) as tf:
                    tf.write(data)
                    tmp_path = tf.name
                html = subprocess.check_output(
                    ["asciidoctor", "-b", "html5", "-o", "-", tmp_path],
                    stderr=subprocess.STDOUT
                ).decode("utf-8", "ignore")
            finally:
                if tmp_path and os.path.exists(tmp_path):
                    try:
                        os.remove(tmp_path)
                    except OSError:
                        pass

            soup = BeautifulSoup(html, "lxml")
            for t in soup(["script", "style", "nav", "header", "footer", "noscript"]):
                t.extract()
            text = soup.get_text("\n")
            text = re.sub(r"\n{3,}", "\n\n", text).strip()
            return text, "asciidoctor->html->text"
    except Exception:
        # Best effort: any failure here falls through to the regex-based stripping
        pass

    # --- Fallback: lightweight AsciiDoc markup stripping ---
    enc = (chardet.detect(data).get("encoding") or "utf-8")
    txt = data.decode(enc, errors="ignore")
    # Attribute entries (":name: value") and anchors / id / role lines
    txt = re.sub(r'(?m)^\s*:[\w\-\.:]+:.*$', '', txt)
    txt = re.sub(r'\[\[[^\]]+\]\]', '', txt)
    txt = re.sub(r'(?m)^\s*\[#?[A-Za-z0-9_.\-]+\]\s*$', '', txt)
    # Cross references, links and images: keep only the label text
    txt = re.sub(r'<<[^,>]+,\s*([^>]+)>>', r'\1', txt)
    txt = re.sub(r'link:[^\[]+\[([^\]]*)\]', r'\1', txt)
    txt = re.sub(r'image:[^\[]+\[([^\]]*)\]', r'\1', txt)
    # Inline formatting marks
    txt = re.sub(r'\*([^\*]+)\*', r'\1', txt)
    txt = re.sub(r'_([^_]+)_', r'\1', txt)
    txt = re.sub(r'`([^`]+)`', r'\1', txt)
    txt = re.sub(r'\+\+([^+]+)\+\+', r'\1', txt)
    txt = re.sub(r'#([^#]+)#', r'\1', txt)
    # Section titles, block attribute lines, table delimiters and cell separators
    txt = re.sub(r'(?m)^\s*={1,6}\s*(.+?)\s*$', r'\1', txt)
    txt = re.sub(r'(?m)^\s*\[[^\]\n]+\]\s*$', '', txt)
    txt = re.sub(r'(?m)^\s*\|[=\-]{3,}\s*$', '', txt)
    txt = re.sub(r'(?m)^\s*\|\s*', '', txt)
    txt = re.sub(r'\s+\|\s+', '  ', txt)
    # Delimiter lines made of four or more dashes/dots
    txt = re.sub(r'(?ms)^\s*[-\.]{4,}\s*$', '', txt)
    txt = re.sub(r'\n{3,}', '\n\n', txt).strip()
    return txt, "adoc-strip"

def _decode_best_effort(raw: bytes) -> str:
    """Decode ``raw`` with the encoding chardet detects, falling back to UTF-8.

    chardet may report no encoding at all, or a name for which Python ships no
    codec; ``bytes.decode`` raises ``LookupError`` for the latter even with
    ``errors="ignore"``, so that case is retried as UTF-8 instead of failing.
    """
    enc = chardet.detect(raw).get("encoding") or "utf-8"
    try:
        return raw.decode(enc, errors="ignore")
    except LookupError:
        return raw.decode("utf-8", errors="ignore")


def convert_bytes_to_text(data: bytes, ext: str):
    """Convert a raw document payload to plain text, dispatching on its extension.

    Args:
        data: Raw bytes of the document.
        ext: File extension including the leading dot; compared
            case-insensitively. ``None`` or ``""`` is treated as unknown.

    Returns:
        A ``(text, method)`` tuple where ``method`` is a short label naming the
        extraction path that produced ``text``.
    """
    ext_l = (ext or "").lower()
    if ext_l == ".pdf":
        # The PDF extractor also returns a third value, which callers of this function do not need.
        text, method, _pages = extract_text_from_pdf_bytes(data, OCR_LANG)
        return text, method
    if ext_l == ".docx":
        text, method = extract_text_from_docx_bytes(data)
        return text, method
    if ext_l == ".doc":
        text, method = extract_text_from_doc_bytes(data)
        return text, method
    if ext_l in (".html", ".htm"):
        text, method = extract_text_from_html_bytes(data)
        return text, method
    if ext_l == ".txt":
        return _decode_best_effort(data), "txt-direct"
    if ext_l == ".adoc":
        # A ZIP local-file-header signature means the ".adoc" is a container:
        # pull out the embedded payload and convert it by its own extension.
        if data[:4] == b"PK\x03\x04":
            payload, p_ext, note = extract_payload_from_adoc_container(data)
            if p_ext:
                txt, meth = convert_bytes_to_text(payload, p_ext)
                return txt, f"{note}->{meth}"
            # No usable payload was identified: decode the raw container bytes as a last resort.
            return _decode_best_effort(data), f"{note}->raw-decode"
        # Otherwise treat it as genuine AsciiDoc text.
        text, method = extract_text_from_adoc_bytes(data)
        return text, method
    # Any other extension goes through the generic LibreOffice-based converter.
    text, method = extract_text_generic_via_libreoffice_bytes(data, ext_l or ".bin")
    return text, method

# ---------- Guess extension ----------
def guess_ext_from_path(path: str) -> str:
    """Return the lower-cased final suffix of ``path`` (e.g. ``".pdf"``).

    An empty string is returned when the last path component has no suffix.
    """
    suffix = pathlib.PurePath(path).suffix
    return suffix.lower()

def guess_ext_from_response(content_type: str, url: str) -> str:
    """Guess a file extension from an HTTP Content-Type header and the URL.

    Args:
        content_type: Raw ``Content-Type`` header value; parameters after ``;``
            are ignored. May be empty or ``None``.
        url: URL the content was fetched from; its path suffix is the fallback.
            May be empty or ``None``.

    Returns:
        A lower-cased extension with leading dot; ``".bin"`` when nothing
        better can be determined.
    """
    if content_type:
        mime = content_type.split(";")[0].strip().lower()
        if mime.startswith("application/pdf"):
            return ".pdf"
        if mime.startswith("text/html"):
            return ".html"
        # "application/octet-stream" only means "some bytes": mapping it to
        # ".bin" would hide a real extension present in the URL, so skip the
        # MIME lookup for it and let the URL suffix decide.
        if mime != "application/octet-stream":
            ext = mimetypes.guess_extension(mime)
            if ext:
                return ext.lower()
    # `url or ""` keeps a missing URL from reaching pathlib as a non-str value.
    url_suffix = pathlib.Path(urlparse(url or "").path).suffix
    return (url_suffix or ".bin").lower()

def convert_item(item, item_index, total_items):
    """Convert the locally stored documents of one item to text files.

    For every entry in ``item["documents"]`` the matching original is looked up
    under ``IN_DIR/<item title>``, converted with ``convert_bytes_to_text`` and
    written as ``<document title>.txt`` under ``OUT_DIR/<item title>``.

    Args:
        item: Dict read with the keys ``title``, ``link``, ``date`` and
            ``documents`` (a list of dicts with ``title`` and ``date``).
        item_index: 1-based position of the item, used for the progress line
            and as a fallback directory name when the item has no title.
        total_items: Total number of items, used only for the progress line.

    Returns:
        A dict with ``title``, ``date``, ``link`` and ``documents``; each
        document carries ``title``, ``date`` and the extracted ``text``
        (``"error text"`` when conversion raised, ``"error2 text"`` when no
        text was produced).
    """
    sys_title = item.get("title") or f"item_{item_index}"
    sys_link  = item.get("link") or ""
    sys_date  = item.get("date")

    base_txt_dir = os.path.join(OUT_DIR, safe_dirname(sys_title))
    base_raw_dir = os.path.join(IN_DIR,  safe_dirname(sys_title))
    os.makedirs(base_txt_dir, exist_ok=True)

    print(f"   - [{item_index}/{total_items}] {sys_title}")
    out_docs = []

    for d in item.get("documents") or []:
        dtitle = d.get("title") or "document"
        ddate  = d.get("date")

        # NOTE(review): documents sharing a title within one item map to the
        # same .txt path and overwrite each other — confirm titles are unique.
        txt_fp = os.path.join(base_txt_dir, safe_filename(dtitle) + ".txt")

        # read local original
        local_path = find_local_original(base_raw_dir, dtitle)

        text = ""
        if local_path and os.path.isfile(local_path):
            try:
                with open(local_path, "rb") as fbin:
                    data = fbin.read()
                text, _method = convert_bytes_to_text(data, guess_ext_from_path(local_path))
            except Exception as e:
                # Best effort: one broken document must not abort the item,
                # but the failure is reported instead of being swallowed.
                print(f"      [WARN] could not convert {local_path}: {e}")
                text = "error text"
        else:
            print(f"      [WARN] no local original found for document: {dtitle}")

        # write TXT — always UTF-8. errors="ignore" drops only code points that
        # cannot be encoded (e.g. lone surrogates); a Latin-1 fallback would
        # strip every character above U+00FF from the text.
        with open(txt_fp, "w", encoding="utf-8", errors="ignore") as ftxt:
            ftxt.write(text or "")

        out_docs.append({"title": dtitle, "date": ddate, "text": text or "error2 text"})

    return {
        "title": item.get("title"),
        "date":  sys_date,
        "link":  sys_link,
        "documents": out_docs
    }

def run_step3_batched(input_json=IN_JSON, output_json=OUT_JSON, batch_size=25, total_batches=3):
    """Convert all items listed in ``input_json`` to text, batch by batch.

    After each batch a checkpoint ``<output_json stem>_part<N>.json`` is
    written; at the end the results of all batches are merged into
    ``output_json``.

    Args:
        input_json: Path of the JSON file holding the list of items.
        output_json: Path of the merged output JSON; also the base name for the
            per-batch checkpoint files.
        batch_size: Number of items per batch; must be at least 1.
        total_batches: Number of batches to run. ``None`` means "as many as
            needed to cover every item". With an explicit number, items beyond
            ``batch_size * total_batches`` are not converted and a warning
            says so.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    with open(input_json, "r", encoding="utf-8") as f:
        items = json.load(f)
    total_items = len(items)

    if total_batches is None:
        # Ceiling division: just enough batches to reach the last item.
        total_batches = -(-total_items // batch_size)

    batches = [
        (start, min(start + batch_size, total_items))
        for start in range(0, batch_size * total_batches, batch_size)
    ]

    skipped = total_items - batch_size * total_batches
    if skipped > 0:
        # Make the truncation visible instead of silently dropping the tail.
        print(f"[WARN] {skipped} of {total_items} items fall outside the "
              f"{total_batches} configured batches and will NOT be converted; "
              f"pass total_batches=None to process everything.")

    # NOTE(review): the session is not handed to convert_item in the code
    # visible here; it is kept and refreshed per batch to preserve existing
    # behaviour — confirm whether it is still needed.
    sess = make_session()
    all_out = []

    try:
        for bi, (lo, hi) in enumerate(batches, start=1):
            if lo >= hi:
                continue
            print(f"\n=== Batch {bi}/{total_batches}: items {lo+1}–{hi} of {total_items} ===")
            batch_out = []

            for idx in range(lo, hi):
                try:
                    rec = convert_item(items[idx], idx+1, total_items)
                    batch_out.append(rec)
                except Exception as e:
                    # Keep a placeholder so the output stays aligned with the input list.
                    print(f"      [WARN] item {idx+1} failed: {e}")
                    batch_out.append({
                        "title": items[idx].get("title"),
                        "date":  items[idx].get("date"),
                        "link":  items[idx].get("link"),
                        "documents": []
                    })

            # checkpoint per batch
            batch_json_path = f"{os.path.splitext(output_json)[0]}_part{bi}.json"
            with open(batch_json_path, "w", encoding="utf-8") as f:
                json.dump(batch_out, f, ensure_ascii=False, indent=2)
            print(f"   -> Saved batch {bi} checkpoint: {batch_json_path}")

            all_out.extend(batch_out)

            # refresh session between batches
            try: sess.close()
            except Exception: pass
            sess = make_session()
            gc.collect()
    finally:
        # The session created after the last batch (or left open by an error)
        # would otherwise never be closed.
        try: sess.close()
        except Exception: pass

    # final merged JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_out, f, ensure_ascii=False, indent=2)
    print(f"\nDone. Wrote merged texts and JSON to {output_json}")

# Script-style entry point: start the batched conversion with its default
# arguments when this code runs as the main module (which is the case when
# the cell is executed in a notebook kernel).
if __name__ == "__main__":
    run_step3_batched()



=== Batch 1/3: items 1–25 of 75 ===
   - [1/75] Pastatų duomenų banko informacinė sistema
         ... OCR page 2/29
         ... OCR page 4/29
         ... OCR page 6/29
         ... OCR page 8/29
         ... OCR page 10/29
         ... OCR page 12/29
         ... OCR page 14/29
         ... OCR page 16/29
         ... OCR page 18/29
         ... OCR page 20/29
         ... OCR page 22/29
         ... OCR page 24/29
         ... OCR page 26/29
         ... OCR page 28/29
   - [2/75] Biologinės įvairovės informacinė sistema
         ... OCR page 2/51
         ... OCR page 4/51
         ... OCR page 6/51
         ... OCR page 8/51
         ... OCR page 10/51
         ... OCR page 12/51
         ... OCR page 14/51
         ... OCR page 16/51
         ... OCR page 18/51
         ... OCR page 20/51
         ... OCR page 22/51
         ... OCR page 24/51
         ... OCR page 26/51
         ... OCR page 28/51
         ... OCR page 30/51
         ... OCR page 32/51
         ... OCR page 34



   - [14/75] Pieno apskaitos informacinė sistema
   - [15/75] VšĮ Vilniaus miesto klinikinės ligoninės informacinė sistema
         ... OCR page 2/43
         ... OCR page 4/43
         ... OCR page 6/43
         ... OCR page 8/43
         ... OCR page 10/43
         ... OCR page 12/43
         ... OCR page 14/43
         ... OCR page 16/43
         ... OCR page 18/43
         ... OCR page 20/43
         ... OCR page 22/43
         ... OCR page 24/43
         ... OCR page 26/43
         ... OCR page 28/43
         ... OCR page 30/43
         ... OCR page 32/43
         ... OCR page 34/43
         ... OCR page 36/43
         ... OCR page 38/43
         ... OCR page 40/43
         ... OCR page 42/43
   - [16/75] Taikomųjų inovacijų tyrimų ir rezultatų informacinė sistema
   - [17/75] Energetikos darbuotojų atestavimo informacinė sistema
   - [18/75] Europos pagalbos labiausiai skurstantiems asmenims fondo informacinė sistema
         ... OCR page 2/42
         ... OCR page 4/42
         



   - [25/75] Valstybės informacinių technologijų paslaugų valdymo informacinė sistema
   -> Saved batch 1 checkpoint: registrai_output_texts_part1.json

=== Batch 2/3: items 26–50 of 75 ===
   - [26/75] Metrikacijos ir gyvenamosios vietos deklaravimo informacinė sistema
         ... OCR page 2/61
         ... OCR page 4/61
         ... OCR page 6/61
         ... OCR page 8/61
         ... OCR page 10/61
         ... OCR page 12/61
         ... OCR page 14/61
         ... OCR page 16/61
         ... OCR page 18/61
         ... OCR page 20/61
         ... OCR page 22/61
         ... OCR page 24/61
         ... OCR page 26/61
         ... OCR page 28/61
         ... OCR page 30/61
         ... OCR page 32/61
         ... OCR page 34/61
         ... OCR page 36/61
         ... OCR page 38/61
         ... OCR page 40/61
         ... OCR page 42/61
         ... OCR page 44/61
         ... OCR page 46/61
         ... OCR page 48/61
         ... OCR page 50/61
         ... OCR page 52/61
     



   - [34/75] Viešosios informacijos rengėjų ir skleidėjų informacinė sistema
         ... OCR page 2/61
         ... OCR page 4/61
         ... OCR page 6/61
         ... OCR page 8/61
         ... OCR page 10/61
         ... OCR page 12/61
         ... OCR page 14/61
         ... OCR page 16/61
         ... OCR page 18/61
         ... OCR page 20/61
         ... OCR page 22/61
         ... OCR page 24/61
         ... OCR page 26/61
         ... OCR page 28/61
         ... OCR page 30/61
         ... OCR page 32/61
         ... OCR page 34/61
         ... OCR page 36/61
         ... OCR page 38/61
         ... OCR page 40/61
         ... OCR page 42/61
         ... OCR page 44/61
         ... OCR page 46/61
         ... OCR page 48/61
         ... OCR page 50/61
         ... OCR page 52/61
         ... OCR page 54/61
         ... OCR page 56/61
         ... OCR page 58/61
         ... OCR page 60/61
   - [35/75] Elektroninės pripažinimo erdvės informacinė sistema
   - [36/75] Informaci



   - [41/75] Žvejybos sektoriaus perleidžiamųjų teisių registro informacinė sistema
   - [42/75] Lietuvos erdvinės informacijos portalas
         ... OCR page 2/151
         ... OCR page 4/151
         ... OCR page 6/151
         ... OCR page 8/151
         ... OCR page 10/151
         ... OCR page 12/151
         ... OCR page 14/151
         ... OCR page 16/151
         ... OCR page 18/151
         ... OCR page 20/151
         ... OCR page 22/151
         ... OCR page 24/151
         ... OCR page 26/151
         ... OCR page 28/151
         ... OCR page 30/151
         ... OCR page 32/151
         ... OCR page 34/151
         ... OCR page 36/151
         ... OCR page 38/151
         ... OCR page 40/151
         ... OCR page 42/151
         ... OCR page 44/151
         ... OCR page 46/151
         ... OCR page 48/151
         ... OCR page 50/151
         ... OCR page 52/151
         ... OCR page 54/151
         ... OCR page 56/151
         ... OCR page 58/151
         ... OCR page 60/1



         ... OCR page 2/8
         ... OCR page 4/8
         ... OCR page 6/8
         ... OCR page 8/8
         ... OCR page 2/6
         ... OCR page 4/6
         ... OCR page 6/6
   - [43/75] Lietuvos Respublikos globalinės padėties nustatymo sistemos nuolatinių stočių tinklas
         ... OCR page 2/41
         ... OCR page 4/41
         ... OCR page 6/41
         ... OCR page 8/41
         ... OCR page 10/41
         ... OCR page 12/41
         ... OCR page 14/41
         ... OCR page 16/41
         ... OCR page 18/41
         ... OCR page 20/41
         ... OCR page 22/41
         ... OCR page 24/41
         ... OCR page 26/41
         ... OCR page 28/41
         ... OCR page 30/41
         ... OCR page 32/41
         ... OCR page 34/41
         ... OCR page 36/41
         ... OCR page 38/41
         ... OCR page 40/41
   - [44/75] Lietuvos Respublikos terminų banko informacinė sistema
         ... OCR page 2/29
         ... OCR page 4/29
         ... OCR page 6/29
         ... O



         ... OCR page 2/9
         ... OCR page 4/9
         ... OCR page 6/9
         ... OCR page 8/9




   - [53/75] Aplinkos projektų monitoringo informacinė sistema
         ... OCR page 2/12
         ... OCR page 4/12
         ... OCR page 6/12
         ... OCR page 8/12
         ... OCR page 10/12
         ... OCR page 12/12




   - [54/75] Viešosios įstaigos Visagino pirminės sveikatos priežiūros centro informacinė sistema
   - [55/75] Viešosios įstaigos Naujosios Vilnios poliklinikos informacinė sistema
         ... OCR page 2/2
   - [56/75] Apribojusių savo galimybę lošti asmenų registras
         ... OCR page 2/52
         ... OCR page 4/52
         ... OCR page 6/52
         ... OCR page 8/52
         ... OCR page 10/52
         ... OCR page 12/52
         ... OCR page 14/52
         ... OCR page 16/52
         ... OCR page 18/52
         ... OCR page 20/52
         ... OCR page 22/52
         ... OCR page 24/52
         ... OCR page 26/52
         ... OCR page 28/52
         ... OCR page 30/52
         ... OCR page 32/52
         ... OCR page 34/52
         ... OCR page 36/52
         ... OCR page 38/52
         ... OCR page 40/52
         ... OCR page 42/52
         ... OCR page 44/52
         ... OCR page 46/52
         ... OCR page 48/52
         ... OCR page 50/52
         ... OCR page 52/52
   - [5



   - [68/75] Vaikų sveikatos stebėsenos informacinė sistema
         ... OCR page 2/73
         ... OCR page 4/73
         ... OCR page 6/73
         ... OCR page 8/73
         ... OCR page 10/73
         ... OCR page 12/73
         ... OCR page 14/73
         ... OCR page 16/73
         ... OCR page 18/73
         ... OCR page 20/73
         ... OCR page 22/73
         ... OCR page 24/73
         ... OCR page 26/73
         ... OCR page 28/73
         ... OCR page 30/73
         ... OCR page 32/73
         ... OCR page 34/73
         ... OCR page 36/73
         ... OCR page 38/73
         ... OCR page 40/73
         ... OCR page 42/73
         ... OCR page 44/73
         ... OCR page 46/73
         ... OCR page 48/73
         ... OCR page 50/73
         ... OCR page 52/73
         ... OCR page 54/73
         ... OCR page 56/73
         ... OCR page 58/73
         ... OCR page 60/73
         ... OCR page 62/73
         ... OCR page 64/73
         ... OCR page 66/73
         ... OCR pag



   - [75/75] Nėščiųjų, gimdyvių ir naujagimių sveikatos duomenų tvarkymo informacinė sistema
         ... OCR page 2/36
         ... OCR page 4/36
         ... OCR page 6/36
         ... OCR page 8/36
         ... OCR page 10/36
         ... OCR page 12/36
         ... OCR page 14/36
         ... OCR page 16/36
         ... OCR page 18/36
         ... OCR page 20/36
         ... OCR page 22/36
         ... OCR page 24/36
         ... OCR page 26/36
         ... OCR page 28/36
         ... OCR page 30/36
         ... OCR page 32/36
         ... OCR page 34/36
         ... OCR page 36/36
   -> Saved batch 3 checkpoint: registrai_output_texts_part3.json

Done. Wrote merged texts and JSON to registrai_output_texts.json


In [None]:
import glob
import os
import shutil

# -------- Settings --------
REPO_NAME = "final_information_modeling_individual_task"
REPO_DIR  = REPO_NAME
FOLDERS   = ["downloaded_articles", "downloaded_articles_txt"]

# 1) Start from an empty repo directory: remove any leftover from an earlier run first.
if os.path.exists(REPO_DIR):
    shutil.rmtree(REPO_DIR)
    print(f"Cleared existing '{REPO_DIR}' directory.")
os.makedirs(REPO_DIR, exist_ok=True)
print(f"Created '{REPO_DIR}' directory.")

# 2) Mirror each data folder into the repo directory; folders that do not exist are reported and skipped.
for folder in FOLDERS:
    if not os.path.isdir(folder):
        print(f"Skipping (not found): {folder}")
        continue
    dest = os.path.join(REPO_DIR, os.path.basename(folder))
    print(f"Copying folder: {folder} -> {dest}")
    shutil.copytree(folder, dest)

# 3) Copy every JSON file from the working directory (metadata preserved by copy2).
json_files = glob.glob("*.json")
if not json_files:
    print("No .json files found.")
for jf in json_files:
    dest = os.path.join(REPO_DIR, os.path.basename(jf))
    print(f"Copying JSON: {jf} -> {dest}")
    shutil.copy2(jf, dest)

# 4) Zip the repo folder next to it (no auto-download); the archive keeps REPO_DIR as its top-level entry.
zip_base = os.path.abspath(REPO_DIR)
zip_path = shutil.make_archive(zip_base, "zip", root_dir=".", base_dir=REPO_DIR)

print("\nAll set!")
print(f"Repo folder: {os.path.abspath(REPO_DIR)}")
print(f"Zip archive: {zip_path}")


Created 'final_information_modeling_individual_task' directory.
Copying folder: downloaded_articles -> final_information_modeling_individual_task/downloaded_articles
Copying folder: downloaded_articles_txt -> final_information_modeling_individual_task/downloaded_articles_txt
Copying JSON: registrai_output_with_docs.json -> final_information_modeling_individual_task/registrai_output_with_docs.json
Copying JSON: registrai_output_texts_part2.json -> final_information_modeling_individual_task/registrai_output_texts_part2.json
Copying JSON: registrai_output_texts_part3.json -> final_information_modeling_individual_task/registrai_output_texts_part3.json
Copying JSON: registrai_output.json -> final_information_modeling_individual_task/registrai_output.json
Copying JSON: registrai_output_texts.json -> final_information_modeling_individual_task/registrai_output_texts.json
Copying JSON: registrai_output_texts_part1.json -> final_information_modeling_individual_task/registrai_output_texts_part1.j