# Parse pre-standard Irish via standardiser

> "Partial, incomplete"

- categories: [irish, ud, stanza]
- branch: master
- badges: true
- hidden: true

In [1]:
%%capture
!pip install stanza

In [2]:
import urllib.parse, urllib.request, json, sys
import stanza

In [3]:
STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data   = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs   = {"Content-Type": "application/x-www-form-urlencoded",
              "Accept":        "application/json"}
    req    = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

In [4]:
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)

In [5]:
from itertools import groupby
from typing import List, Tuple

def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():
        return [orig]
    return std.split()

def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Very light sentence splitter: keep everything up to . ! ?"""
    sent, buf = [], []
    for i, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        for j, part in enumerate(parts):
            buf.append((i, j, len(parts), orig, part))
            if part in {".", "!", "?"}:
                sent.append(buf);  buf = []
    if buf:
        sent.append(buf)
    return sent

def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    pairs  = standardise(raw_text, lang)

    sents  = _sentences_from_pairs(pairs)
    pretok = [[m[4] for m in sent] for sent in sents]

    doc = nlp(pretok)

    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        raw_slice = [m[3] for m in sent_map if m[1] == 0]
        std_slice = [m[4] for m in sent_map]
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        # token lines
        widx = 0
        tid  = 1
        for m in sent_map:
            orig_i, sub_i, n_sub, orig_tok, std_tok = m
            word = sent_doc.words[widx]

            if sub_i == 0 and n_sub > 1:
                conllu_lines.append(f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_")

            form = orig_tok if n_sub == 1 else std_tok

            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos  or "_",
                word.xpos  or "_",
                word.feats or "_",
                str(word.head) if word.head else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))

            widx += 1
            tid  += 1
        conllu_lines.append("")

    return "\n".join(conllu_lines)


In [6]:
nlp_tok = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=False,
    verbose=False
)

In [7]:
pp = project_with_stanza("E-, ‘firing range’ a mbíonns acub agus é seo agus é siúd.")

In [13]:
print(pp)

# sent_id = 1
# text = E - , ‘ firing range ’ a mbíonns acub agus é seo agus é siúd .
# text_standard = É - , ‘ firing range ’ a mbíonn acu agus é seo agus é siúd .
1	E	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	_	root	_	_
2	-	-	PUNCT	.	_	4	punct	_	_
3	,	,	PUNCT	Punct	_	4	punct	_	_
4	‘	‘	X	Foreign	Foreign=Yes	1	parataxis	_	_
5	firing	firing	X	Foreign	Foreign=Yes	4	flat:foreign	_	_
6	range	range	X	Foreign	Foreign=Yes	4	flat:foreign	_	_
7	’	’	PUNCT	Punct	_	9	punct	_	_
8	a	a	PART	Vb	Form=Indirect|PartType=Vb|PronType=Rel	9	mark:prt	_	_
9	mbíonns	bí	VERB	PresImp	Aspect=Hab|Form=Ecl|Mood=Ind|Tense=Pres	4	acl:relcl	_	_
10	acub	ag	ADP	Prep	Number=Plur|Person=3	9	obl:prep	_	_
11	agus	agus	CCONJ	Coord	_	12	cc	_	_
12	é	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	1	conj	_	_
13	seo	seo	PRON	Dem	PronType=Dem	12	det	_	_
14	agus	agus	SCONJ	Subord	_	15	mark	_	_
15	é	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	12	conj	_	_
16	siúd	siúd	PRON	Dem	PronType=Dem	15	det	_	_
17	.	.	PUNCT	.	_	1	punct	_	_



In [22]:
raw = "Eh, 'firing range' a mbíonns acub agus é seo agus é siúd."

In [21]:
lines = "{:C}".format(nlp_tok(raw)).split("\n")
print("\n".join(lines))

# text = Eh, 'firing range' a mbíonn acub agus é seo agus é siúd.
# sent_id = 0
1	Eh	Eh	INTJ	Itj	_	0	root	_	start_char=0|end_char=2|SpaceAfter=No
2	,	,	PUNCT	Punct	_	4	punct	_	start_char=2|end_char=3
3	'	'	PUNCT	Punct	_	4	punct	_	start_char=4|end_char=5|SpaceAfter=No
4	firing	firing	X	Foreign	Foreign=Yes	1	parataxis	_	start_char=5|end_char=11
5	range	range	X	Foreign	Foreign=Yes	4	flat:foreign	_	start_char=12|end_char=17|SpaceAfter=No
6	'	'	PUNCT	Punct	_	4	punct	_	start_char=17|end_char=18
7	a	a	PART	Vb	Form=Indirect|PartType=Vb|PronType=Rel	8	mark:prt	_	start_char=19|end_char=20
8	mbíonn	bí	VERB	PresImp	Aspect=Hab|Form=Ecl|Mood=Ind|Tense=Pres	4	csubj:cleft	_	start_char=21|end_char=27
9	acub	ag	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	8	nsubj	_	start_char=28|end_char=32
10	agus	agus	CCONJ	Coord	_	11	cc	_	start_char=33|end_char=37
11	é	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	9	conj	_	start_char=38|end_char=39
12	seo	seo	PRON	Dem	PronType=Dem	11	det	_	start_char=40|end_char=43
13	a

In [24]:
!apt install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]
Fetched 186 kB in 0s (436 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126374 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...
Setting up poppler-utils (22.02.0-2ubuntu0.10) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
!pip install pdfplumber

In [26]:
!wget https://www.dias.ie/wp-content/uploads/2014/03/track01.pdf

--2025-09-04 13:38:23--  https://www.dias.ie/wp-content/uploads/2014/03/track01.pdf
Resolving www.dias.ie (www.dias.ie)... 160.6.22.11, 2001:770:60:22::11
Connecting to www.dias.ie (www.dias.ie)|160.6.22.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 52925 (52K) [application/pdf]
Saving to: ‘track01.pdf’


2025-09-04 13:38:24 (189 KB/s) - ‘track01.pdf’ saved [52925/52925]



In [76]:
ph = 1262
pw = 892
top = 157
left = 211
bottom = 588 + 16
right = 211 + 497

bbox = (left, top, right, bottom)
print(bbox)

(211, 157, 708, 604)


In [78]:
import pdfplumber

def extract_text_from_poppler_box_scaled(
    pdf_path,
    page_idx,
    left, top, right=None, bottom=None, width=None, height=None,
    pw=None, ph=None  # Poppler page width/height from XML
):
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_idx]
        # PDF native dimensions in points
        pdf_w, pdf_h = page.width, page.height

        if pw is None or ph is None:
            raise ValueError("Provide Poppler page width/height (pw, ph) from the XML.")

        # Scale factor Poppler used relative to PDF points
        sx = pw / pdf_w
        sy = ph / pdf_h
        # They should be ~equal; use average to be safe
        s = (sx + sy) / 2.0

        # Normalize Poppler rectangle (top-left origin)
        if right is None or bottom is None:
            if width is None or height is None:
                raise ValueError("Provide either (right,bottom) or (width,height).")
            right = left + width
            bottom = top + height

        # Ensure proper ordering
        x0_pop, x1_pop = sorted((left, right))
        y0_pop, y1_pop = sorted((top, bottom))

        # Descale from Poppler space -> PDF points
        x0_pt = x0_pop / s
        x1_pt = x1_pop / s
        y0_pt = y0_pop / s
        y1_pt = y1_pop / s

        # Convert Poppler top-left coords -> PDF bottom-left coords
        pdf_top    = pdf_h - y0_pt
        pdf_bottom = pdf_h - y1_pt

        bbox = (x0_pt, pdf_bottom, x1_pt, pdf_top)  # (x0, bottom, x1, top)

        # Optional: sanity clamp to page bounds to avoid ValueError
        x0, btm, x1, top_ = bbox
        x0 = max(0, min(x0, pdf_w))
        x1 = max(0, min(x1, pdf_w))
        btm = max(0, min(btm, pdf_h))
        top_ = max(0, min(top_, pdf_h))
        if x1 < x0 or top_ < btm:
            return ""  # empty if fully out of bounds

        bbox = (x0, btm, x1, top_)

        return page.crop(bbox).extract_text() or ""


In [81]:
extract_text_from_poppler_box_scaled("track01.pdf", 1, top=209, left=211, right=708, bottom=588 + 16, ph=1262, pw=892)

''