In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.11.0-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.15.0 stanza-1.11.0


In [11]:
from __future__ import annotations

import json
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Dict, List, Sequence, Tuple

import stanza

STD_API = "https://cadhan.com/api/intergaelic/3.0"

PRON_FEATS: Dict[str, str] = {
    "mé": "Person=1|Number=Sing",
    "tú": "Person=2|Number=Sing",
    "muid": "Person=1|Number=Plur",
    "sinn": "Person=1|Number=Plur",
    "sibh": "Person=2|Number=Plur",
    "siad": "Person=3|Number=Plur",
}
PRON_FORMS = set(PRON_FEATS.keys())

# Download once (idempotent)
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

# We supply tokens + sentence breaks
nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False,
)


def standardise(text: str, lang: str = "ga") -> List[Tuple[str, str]]:
    """Intergaelic pairs with a strict check ONLY for 1→2 splits."""
    data = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs = {"Content-Type": "application/x-www-form-urlencoded", "Accept": "application/json"}
    req = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        pairs = [tuple(x) for x in json.loads(resp.read())]

    # If (orig is 1 token) and (std is 2 tokens), then std[1] MUST be one of our pronouns.
    # Anything else (2→2, 2→1, 3→2, etc.) is allowed.
    for i, (orig, std) in enumerate(pairs):
        o = orig.split()
        s = (std or "").split()
        if len(o) == 1 and len(s) == 2 and s[1].lower() not in PRON_FORMS:
            raise ValueError(
                f"Unexpected 1→2 Intergaelic output at index {i}: orig={orig!r}, std={std!r}"
            )

    return pairs


def compute_spaceafter(raw_text: str, orig_tokens: List[str]) -> List[bool]:
    """
    True  => there WAS whitespace after this token in raw_text (no SpaceAfter=No needed)
    False => no whitespace after (emit SpaceAfter=No)
    Aligns monotically by substring search; raises if alignment fails.
    """
    flags: List[bool] = []
    pos = 0
    n = len(raw_text)

    for i, tok in enumerate(orig_tokens):
        # Skip whitespace before token
        while pos < n and raw_text[pos].isspace():
            pos += 1

        # Exact at current position preferred
        if raw_text.startswith(tok, pos):
            start = pos
        else:
            start = raw_text.find(tok, pos)
            if start == -1:
                raise ValueError(f"Could not align token {i} {tok!r} near pos {pos}")
        end = start + len(tok)
        pos = end

        if pos >= n:
            flags.append(True)  # end-of-text
        else:
            flags.append(raw_text[pos].isspace())

    return flags


def feats_to_dict(feats: str) -> Dict[str, str]:
    if not feats or feats == "_":
        return {}
    out: Dict[str, str] = {}
    for part in feats.split("|"):
        if "=" in part:
            k, v = part.split("=", 1)
            out[k] = v
    return out


def dict_to_feats(d: Dict[str, str]) -> str:
    if not d:
        return "_"
    return "|".join(f"{k}={v}" for k, v in sorted(d.items()))


def merge_feats_preserve(base: str, add: str) -> str:
    """Merge without overwriting existing keys."""
    bd = feats_to_dict(base)
    ad = feats_to_dict(add)
    for k, v in ad.items():
        bd.setdefault(k, v)
    return dict_to_feats(bd)


def merge_misc(*items: str) -> str:
    parts: List[str] = []
    for it in items:
        if it and it != "_":
            parts.append(it)
    return "_" if not parts else "|".join(parts)


@dataclass(frozen=True)
class MapItem:
    orig_i: int
    sub_i: int
    n_sub: int
    orig_tok: str
    std_tok: str


def split_std(std: str, orig: str) -> List[str]:
    if not (std or "").strip():
        return [orig]
    return (std or "").split()


def sentences_from_pairs(pairs: Sequence[Tuple[str, str]]) -> List[List[MapItem]]:
    """End a sentence when standardized token is . ! ?"""
    sents: List[List[MapItem]] = []
    buf: List[MapItem] = []
    for i, (orig, std) in enumerate(pairs):
        parts = split_std(std, orig)
        n = len(parts)
        for j, part in enumerate(parts):
            buf.append(MapItem(i, j, n, orig, part))
            if part in {".", "!", "?"}:
                sents.append(buf)
                buf = []
    if buf:
        sents.append(buf)
    return sents


def choose_rep_word(words, idxs: List[int]) -> int:
    """
    Representative word for lemma/POS/feats:
    prefer non-PRON when there is a 1→2 split (X PRON).
    """
    for i in idxs:
        if (words[i].upos or "") != "PRON":
            return i
    return idxs[0]


def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    pairs = standardise(raw_text, lang)
    orig_tokens = [o for (o, _) in pairs]
    spaceafter = compute_spaceafter(raw_text, orig_tokens)

    sents = sentences_from_pairs(pairs)
    pretok: List[List[str]] = [[m.std_tok for m in sent] for sent in sents]
    doc = nlp(pretok)

    out: List[str] = []

    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        raw_slice = [m.orig_tok for m in sent_map if m.sub_i == 0]
        std_slice = [m.std_tok for m in sent_map]
        out += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        words = sent_doc.words

        # Unique orig indices in this sentence, in order
        orig_keys: List[int] = []
        for m in sent_map:
            if not orig_keys or orig_keys[-1] != m.orig_i:
                orig_keys.append(m.orig_i)

        # orig_i -> sentence-local token id
        orig_i_to_tid = {orig_i: k + 1 for k, orig_i in enumerate(orig_keys)}

        # orig_i -> stanza word indices in this sentence
        orig_i_to_widxs: Dict[int, List[int]] = {orig_i: [] for orig_i in orig_keys}
        for widx, m in enumerate(sent_map):
            orig_i_to_widxs[m.orig_i].append(widx)

        # stanza word index -> orig token id (for head remap)
        widx_to_tid: Dict[int, int] = {}
        for orig_i, widxs in orig_i_to_widxs.items():
            tid = orig_i_to_tid[orig_i]
            for widx in widxs:
                widx_to_tid[widx] = tid

        # Emit one CoNLL-U token per orig_i
        for orig_i in orig_keys:
            tid = orig_i_to_tid[orig_i]
            widxs = orig_i_to_widxs[orig_i]

            rep_widx = choose_rep_word(words, widxs)
            rep = words[rep_widx]

            # Head remap: stanza is 1-based, 0=root
            if rep.head and rep.head != 0:
                head_widx0 = rep.head - 1
                head_tid = widx_to_tid.get(head_widx0, 0)
            else:
                head_tid = 0

            form = pairs[orig_i][0] or "_"
            feats = rep.feats or "_"

            misc_parts: List[str] = []
            if not spaceafter[orig_i]:
                misc_parts.append("SpaceAfter=No")

            # Inject Person/Number only for 1→2 splits (orig single token, std two tokens)
            orig_parts = (pairs[orig_i][0] or "").split()
            std_parts = (pairs[orig_i][1] or "").split()
            if len(orig_parts) == 1 and len(std_parts) == 2:
                inj_pron = std_parts[1].lower()
                # guaranteed by standardise() invariant
                feats = merge_feats_preserve(feats, PRON_FEATS[inj_pron])
                misc_parts.append(f"StdSplit={std_parts[0]}|{std_parts[1]}")

            misc = merge_misc(*misc_parts)

            out.append("\t".join([
                str(tid),
                form,
                rep.lemma or "_",
                rep.upos or "_",
                rep.xpos or "_",
                feats,
                str(head_tid),
                rep.deprel or "_",
                "_",
                misc,
            ]))

        out.append("")

    return "\n".join(out)

In [12]:
f = project_with_stanza('Do leanadar ag "seasamh a gcirt" go dtí gur dhein Eoghan Rua Ó Néill, ag an mBeinn mBorb, gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin.')

In [13]:
print(f)

# sent_id = 1
# text = Do leanadar ag " seasamh a gcirt " go dtí gur dhein Eoghan Rua Ó Néill , ag an mBeinn mBorb , gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin .
# text_standard = Do leanadar ag " seasamh a gcirt " go dtí go ndearna Eoghan Rua Ó Néill , ag an mBinn mBorb , gníomh díreach de shaghas an ghnímh a rinne deartháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimhe sin .
1	Do	do	PART	Vb	PartType=Vb	2	mark:prt	_	_
2	leanadar	lean	VERB	VTI	Mood=Ind|Number=Plur|Person=1|Tense=Past	0	root	_	_
3	ag	ag	ADP	Simp	_	5	case	_	_
4	"	"	PUNCT	Punct	_	5	punct	_	SpaceAfter=No
5	seasamh	seasamh	NOUN	Noun	VerbForm=Inf	2	xcomp	_	_
6	a	a	DET	Det	Number=Plur|Person=3|Poss=Yes	7	nmod:poss	_	_
7	gcirt	ceirt	NOUN	Noun	Case=Gen|Definite=Def|Form=Ecl|Gender=Fem|Number=Sing	5	nmod	_	SpaceAfter=No
8	"	"	PUNCT	Punct	_	5	punct	_	_
9	go	go	ADP	Cmpd	PrepForm=Cmpd	11	mar