In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.11.0-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.15.0 stanza-1.11.0


In [17]:
from __future__ import annotations

import json
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Dict, List, Optional, Sequence, Tuple

import stanza

# Endpoint of the Intergaelic standardisation service queried by standardise().
STD_API = "https://cadhan.com/api/intergaelic/3.0"

# Pronoun forms recognised as Intergaelic injections, mapped to the
# Person/Number features they contribute to the token they are folded into.
# Only these forms are treated as injections.
PRON_FEATS: Dict[str, str] = {
    "mé": "Person=1|Number=Sing",
    "tú": "Person=2|Number=Sing",
    "muid": "Person=1|Number=Plur",
    "sinn": "Person=1|Number=Plur",
    "sibh": "Person=2|Number=Plur",
    "siad": "Person=3|Number=Plur",
}
PRON_FORMS = set(PRON_FEATS)

# Stanza setup.
# One processor list shared by the model download and the pipeline so the two
# cannot drift apart.
STANZA_PROCESSORS = "tokenize,pos,lemma,depparse"

stanza.download("ga", processors=STANZA_PROCESSORS, verbose=False)
nlp = stanza.Pipeline(
    lang="ga",
    processors=STANZA_PROCESSORS,
    # Input is handed over already tokenised, one list of tokens per sentence.
    tokenize_pretokenized=True,
    # Processor options carry the processor-name prefix; the documented
    # spelling of this one is `tokenize_no_ssplit` (it was `no_ssplit`).
    tokenize_no_ssplit=True,
    verbose=False,
)


def standardise(text: str, lang: str = "ga", timeout: float = 30.0) -> List[Tuple[str, str]]:
    """
    Return the list of (orig_chunk, std_chunk) rewrite units from Intergaelic.

    The text is POSTed form-encoded to ``STD_API`` as the fields ``foinse``
    (``lang``) and ``teacs`` (``text``); the JSON response is expected to be a
    list of two-element units.

    Args:
        text: Text to standardise.
        lang: Value sent as the ``foinse`` form field.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled service cannot hang the notebook indefinitely.

    Returns:
        One ``(orig_chunk, std_chunk)`` tuple per rewrite unit, in response order.

    Raises:
        ValueError: If the response is not a list of two-element units.
        urllib.error.URLError: On network failure (propagated from ``urlopen``).
    """
    payload = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
    }
    request = urllib.request.Request(STD_API, data=payload, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        units = json.loads(resp.read())

    # A JSON object (e.g. an error payload) would otherwise be iterated key by
    # key and silently turned into meaningless tuples.
    if not isinstance(units, list):
        raise ValueError(f"Unexpected Intergaelic response (expected a list): {units!r}")

    pairs: List[Tuple[str, str]] = []
    for idx, unit in enumerate(units):
        if not isinstance(unit, (list, tuple)) or len(unit) != 2:
            raise ValueError(f"Unexpected Intergaelic rewrite unit at index {idx}: {unit!r}")
        pairs.append((unit[0], unit[1]))
    return pairs


@dataclass(frozen=True)
class Tok:
    """
    One output token (orig_tok) aligned 1:1 with one standardised token
    (std_tok), the latter being what is passed to Stanza.

    When Intergaelic injected a pronoun (the standardised chunk has exactly one
    extra token and that last token is one of the recognised pronoun forms),
    the pronoun does not get a Tok of its own: it is stored as inj_pron on the
    LAST token of the rewrite unit.

    Instances are immutable (frozen dataclass).
    """
    # Token as it appears in the original, pre-standardisation chunk.
    orig_tok: str
    # Standardised counterpart of orig_tok.
    std_tok: str
    # Lower-cased injected pronoun carried by this token, or None when the
    # rewrite unit had no injection (or this is not its last token).
    inj_pron: Optional[str] = None


def expand_and_align(pairs: Sequence[Tuple[str, str]]) -> List[Tok]:
    """
    Expand Intergaelic rewrite units into a flat token stream with strict alignment.

    Both sides of every unit are split on whitespace. Supported shapes:
      1) Empty standardised side: the original tokens are aligned to themselves.
      2) N→N: tokens are aligned positionally.
      3) Injection: the standardised side has exactly one extra token, that
         token (compared lower-cased) is in PRON_FORMS, and the original side
         has at least one token. The shared prefix is aligned positionally and
         the lower-cased pronoun is attached as inj_pron to the LAST original
         token of the unit.

    Everything else raises (no guessing). That includes a lone injected pronoun
    with no original token to carry it, which would otherwise vanish silently.

    Args:
        pairs: (orig_chunk, std_chunk) rewrite units; a falsy chunk is treated
            as empty.

    Returns:
        One Tok per original token, in input order.

    Raises:
        ValueError: If a unit matches none of the supported shapes.
    """
    out: List[Tok] = []

    for i, (orig_chunk, std_chunk) in enumerate(pairs):
        orig_parts = (orig_chunk or "").split()
        std_parts = (std_chunk or "").split()

        # Empty standardised side: identity on the original tokens (a no-op
        # when the original side is empty too).
        if not std_parts:
            out.extend(Tok(o, o) for o in orig_parts)
            continue

        if len(orig_parts) == len(std_parts):
            out.extend(Tok(o, s) for o, s in zip(orig_parts, std_parts))
            continue

        is_injection = (
            bool(orig_parts)
            and len(std_parts) == len(orig_parts) + 1
            and std_parts[-1].lower() in PRON_FORMS
        )
        if is_injection:
            inj = std_parts[-1].lower()
            last = len(orig_parts) - 1
            # zip() stops at the shorter side, which drops the trailing pronoun.
            for j, (o, s) in enumerate(zip(orig_parts, std_parts)):
                out.append(Tok(o, s, inj_pron=inj if j == last else None))
            continue

        raise ValueError(
            f"Unsupported Intergaelic mapping at pair index {i}: "
            f"orig={orig_chunk!r} ({len(orig_parts)} toks) std={std_chunk!r} ({len(std_parts)} toks)"
        )

    return out


def compute_spaceafter(raw_text: str, orig_tokens: List[str]) -> List[bool]:
    """
    Report, for each token, whether whitespace follows it in raw_text.

    Tokens are located left to right, each search starting where the previous
    token ended, so the alignment is monotonic.

    Returns one flag per token:
      True  => whitespace followed the token, or the token ends the text
               (no SpaceAfter=No needed)
      False => the next character is not whitespace (emit SpaceAfter=No)

    Raises ValueError if a token cannot be found at or after the cursor.
    """
    text_len = len(raw_text)
    cursor = 0
    result: List[bool] = []

    for index, token in enumerate(orig_tokens):
        # Advance past any whitespace preceding the token.
        while cursor < text_len and raw_text[cursor].isspace():
            cursor += 1

        # Take the token right at the cursor when possible; otherwise fall back
        # to the next occurrence further along.
        if raw_text.startswith(token, cursor):
            found_at = cursor
        else:
            found_at = raw_text.find(token, cursor)
        if found_at == -1:
            raise ValueError(f"Could not align token {index} {token!r} near pos {cursor}")

        cursor = found_at + len(token)
        # End of text counts as "space after".
        result.append(cursor >= text_len or raw_text[cursor].isspace())

    return result


def sentences_from_tokens(tokens: Sequence[Tok]) -> List[List[int]]:
    """
    Split the token stream into sentences using the standardised tokens.

    A sentence ends on (and includes) any token whose std_tok is ".", "!" or
    "?". Tokens left over after the last terminator form a final sentence.
    Each sentence is returned as the list of its indices into `tokens`.
    """
    terminators = {".", "!", "?"}
    # Positions of every sentence-closing token.
    ends = [idx for idx, tok in enumerate(tokens) if tok.std_tok in terminators]

    sentences: List[List[int]] = []
    start = 0
    for end in ends:
        sentences.append(list(range(start, end + 1)))
        start = end + 1

    # Trailing tokens without closing punctuation still make up a sentence.
    total = len(tokens)
    if start < total:
        sentences.append(list(range(start, total)))
    return sentences


def feats_to_dict(feats: str) -> Dict[str, str]:
    """
    Parse a "Key=Value|Key=Value" feature string into a dict.

    A falsy value or "_" gives an empty dict. Segments without "=" are skipped,
    only the first "=" in a segment separates key from value, and a repeated
    key keeps its last value.
    """
    if feats == "_" or not feats:
        return {}
    return {
        key: value
        for key, sep, value in (segment.partition("=") for segment in feats.split("|"))
        if sep
    }


def dict_to_feats(d: Dict[str, str]) -> str:
    """
    Serialise a feature dict as a CoNLL-U FEATS string.

    Features are joined as "Key=Value" with "|" and ordered alphabetically by
    name ignoring case, which is the order CoNLL-U requires (so "Number" comes
    before "NumType"; a plain case-sensitive sort would reverse them).

    Returns "_" for an empty dict.
    """
    if not d:
        return "_"
    ordered = sorted(d.items(), key=lambda item: item[0].lower())
    return "|".join(f"{k}={v}" for k, v in ordered)


def merge_feats_preserve(base: str, add: str) -> str:
    """
    Combine two feature strings, never overwriting a key `base` already has.

    Keys found only in `add` are appended; for keys present in both, the value
    from `base` is kept (so features Stanza already provided, such as
    Person/Number, are not replaced).
    """
    existing = feats_to_dict(base)
    extra = feats_to_dict(add)

    merged = dict(existing)
    merged.update((key, value) for key, value in extra.items() if key not in existing)
    return dict_to_feats(merged)


def merge_misc(*items: str) -> str:
    """
    Join MISC entries with "|", ignoring empty values and "_" placeholders.

    Returns "_" when nothing is left to join.
    """
    kept = [item for item in items if item and item != "_"]
    if not kept:
        return "_"
    return "|".join(kept)


def choose_rep_word(words, idxs: List[int]) -> int:
    """
    Pick the index of the word that represents a group of words.

    The first index in `idxs` whose word is not tagged PRON wins; if every word
    in the group is a PRON, the first index is used.

    Kept for a possible future extension where several words are grouped under
    one token; at present each Tok is 1:1 with a Stanza word, so `idxs` has
    length 1.
    """
    first_non_pron = next(
        (idx for idx in idxs if words[idx].upos != "PRON"),
        None,
    )
    return idxs[0] if first_non_pron is None else first_non_pron


def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    """
    Project a Stanza analysis of the standardised text onto the original tokens.

    How the output is built:
      - Tokenization = aligned original tokens (original chunks expanded by
        whitespace); these become the FORM column.
      - Stanza is run on the aligned standardised tokens, pre-split into
        sentences; LEMMA/UPOS/XPOS/FEATS/HEAD/DEPREL come from it.
      - A pronoun injected by Intergaelic never creates a token: it contributes
        Person/Number to the token carrying Tok.inj_pron and is recorded in
        MISC as InjPron=<form>.
      - SpaceAfter=No is derived from the spacing in raw_text, and the
        "# text" comment is rebuilt from the original tokens with that same
        spacing, so the comment and the SpaceAfter annotations always agree.

    Args:
        raw_text: Original text to annotate.
        lang: Language code forwarded to standardise(). The Stanza pipeline is
            the module-level `nlp` and is not affected by this argument.

    Returns:
        CoNLL-U text. Each sentence contributes "# sent_id", "# text" and
        "# text_standard" comment lines, one 10-column line per token, and an
        empty line. An empty string is returned when there are no tokens.

    Raises:
        ValueError: If Stanza's sentences or words do not line up 1:1 with the
            pre-tokenised input (besides anything raised by the helpers).
    """
    pairs = standardise(raw_text, lang)
    toks = expand_and_align(pairs)
    if not toks:
        # Nothing to annotate; avoid handing Stanza an empty document.
        return ""

    spaceafter = compute_spaceafter(raw_text, [t.orig_tok for t in toks])

    sents = sentences_from_tokens(toks)
    doc = nlp([[toks[i].std_tok for i in sent] for sent in sents])

    # zip() below would silently drop sentences on a mismatch; fail loudly instead.
    if len(doc.sentences) != len(sents):
        raise ValueError(
            f"Stanza returned {len(doc.sentences)} sentences for {len(sents)} pretokenized sentences"
        )

    out: List[str] = []

    for sid, (sent_idxs, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        words = sent_doc.words  # expected: 1 per pretokenized token
        if len(words) != len(sent_idxs):
            raise ValueError(
                f"Sentence {sid}: Stanza returned {len(words)} words for {len(sent_idxs)} tokens"
            )

        # Rebuild the surface text with the original spacing; the trailing
        # space after the sentence-final token is not part of the sentence.
        text = "".join(
            toks[i].orig_tok + (" " if spaceafter[i] else "") for i in sent_idxs
        ).rstrip()
        text_standard = " ".join(toks[i].std_tok for i in sent_idxs)

        out += [
            f"# sent_id = {sid}",
            f"# text = {text}",
            f"# text_standard = {text_standard}",
        ]

        # Token ids are sentence-local and 1-based, matching Stanza's heads.
        for tid, (tok_i, w) in enumerate(zip(sent_idxs, words), 1):
            t = toks[tok_i]

            feats = w.feats or "_"
            misc_parts: List[str] = []

            if not spaceafter[tok_i]:
                misc_parts.append("SpaceAfter=No")

            if t.inj_pron is not None:
                # expand_and_align() only sets inj_pron to a key of PRON_FEATS.
                feats = merge_feats_preserve(feats, PRON_FEATS[t.inj_pron])
                misc_parts.append(f"InjPron={t.inj_pron}")

            out.append("\t".join([
                str(tid),
                t.orig_tok or "_",
                w.lemma or "_",
                w.upos or "_",
                w.xpos or "_",
                feats,
                str(w.head or 0),  # None or 0 both mean root
                w.deprel or "_",
                "_",
                merge_misc(*misc_parts),
            ]))

        out.append("")

    return "\n".join(out)


In [18]:
# Run the whole projection on one sample sentence and keep the CoNLL-U text.
f = project_with_stanza(
    'Do leanadar ag "seasamh a gcirt" go dtí gur dhein Eoghan Rua Ó Néill, ag an mBeinn mBorb, gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin.'
)

In [19]:
# print() rather than echoing the value, so the tab- and newline-separated
# CoNLL-U text is rendered as lines instead of as an escaped string repr.
print(f)

# sent_id = 1
# text = Do leanadar ag " seasamh a gcirt " go dtí gur dhein Eoghan Rua Ó Néill , ag an mBeinn mBorb , gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin .
# text_standard = Do leanadar ag " seasamh a gcirt " go dtí go ndearna Eoghan Rua Ó Néill , ag an mBinn mBorb , gníomh díreach de shaghas an ghnímh a rinne deartháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimhe sin .
1	Do	do	PART	Vb	PartType=Vb	2	mark:prt	_	_
2	leanadar	lean	VERB	VTI	Mood=Ind|Number=Plur|Person=1|Tense=Past	0	root	_	_
3	ag	ag	ADP	Simp	_	5	case	_	_
4	"	"	PUNCT	Punct	_	5	punct	_	SpaceAfter=No
5	seasamh	seasamh	NOUN	Noun	VerbForm=Inf	2	xcomp	_	_
6	a	a	DET	Det	Number=Plur|Person=3|Poss=Yes	7	nmod:poss	_	_
7	gcirt	ceirt	NOUN	Noun	Case=Gen|Definite=Def|Form=Ecl|Gender=Fem|Number=Sing	5	nmod	_	SpaceAfter=No
8	"	"	PUNCT	Punct	_	5	punct	_	_
9	go	go	ADP	Cmpd	PrepForm=Cmpd	12	mar