In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.11.0-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.15.0 stanza-1.11.0


In [None]:
from __future__ import annotations

import json
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Dict, List, Optional, Sequence, Tuple

import stanza

# Endpoint of the Intergaelic standardisation service queried by `standardise`.
STD_API = "https://cadhan.com/api/intergaelic/3.0"

# Person/Number features for each pronoun that may appear as the second part
# of a split standardised token. Keys are lowercase pronoun forms.
PRON_FEATS: Dict[str, str] = {
    form: f"Person={person}|Number={number}"
    for form, person, number in (
        ("mé", 1, "Sing"),
        ("tú", 2, "Sing"),
        ("muid", 1, "Plur"),
        ("sinn", 1, "Plur"),
        ("sibh", 2, "Plur"),
        ("siad", 3, "Plur"),
    )
}
# Membership set used when validating split tokens.
PRON_FORMS = set(PRON_FEATS)

# Fetch the Irish models at import time. The call is idempotent, so it can
# stay here when this file is run as a script.
stanza.download(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    verbose=False,
)

# Tokens and sentence boundaries are supplied by this module, so the pipeline
# is configured not to re-tokenise or re-split its input.
nlp = stanza.Pipeline(
    "ga",
    verbose=False,
    processors="tokenize,pos,lemma,depparse",
    no_ssplit=True,
    tokenize_pretokenized=True,
)


def standardise(
    text: str, lang: str = "ga", timeout: Optional[float] = 30.0
) -> List[Tuple[str, str]]:
    """Send *text* to Intergaelic and return its ``(orig_tok, std_tok)`` pairs.

    Args:
        text: Raw text to standardise.
        lang: Source-language code sent as the ``foinse`` form field.
        timeout: Seconds to wait on the HTTP request; ``None`` waits
            indefinitely.

    Returns:
        One ``(orig_tok, std_tok)`` tuple per item of the JSON response, in
        response order.

    Raises:
        ValueError: If a response item is not a pair of strings, or if a
            standardised token splits on whitespace into anything other than
            exactly ``"<X> <PRON>"`` with ``PRON`` (case-insensitively) in
            ``PRON_FORMS``.
        urllib.error.URLError: On network failure (including timeouts).
    """
    payload = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
    }
    request = urllib.request.Request(STD_API, payload, headers)
    # An explicit timeout keeps a stalled connection from blocking forever.
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw_items = json.loads(resp.read())

    pairs: List[Tuple[str, str]] = []
    for i, item in enumerate(raw_items):
        # Reject malformed items up front so the failure names the offending
        # entry instead of surfacing later as an unpacking/attribute error.
        if (
            not isinstance(item, (list, tuple))
            or len(item) != 2
            or not all(isinstance(tok, str) for tok in item)
        ):
            raise ValueError(
                f"Malformed Intergaelic output at index {i}: {item!r}"
            )
        orig, std = item

        # Hard invariant: if std splits, it's exactly "<X> <PRON>" where PRON
        # is in PRON_FORMS.
        parts = std.split()
        if len(parts) > 1 and (
            len(parts) != 2 or parts[1].lower() not in PRON_FORMS
        ):
            raise ValueError(
                f"Unexpected multi-token Intergaelic output at index {i}: "
                f"orig={orig!r}, std={std!r}"
            )
        pairs.append((orig, std))

    return pairs


def _feats_to_dict(feats: str) -> Dict[str, str]:
    """Parse a ``Name=Value|Name=Value`` feature string into a dict.

    A falsy value or the placeholder ``"_"`` yields an empty dict. Segments
    without ``=`` are skipped, values may themselves contain ``=``, and when a
    name repeats the last occurrence wins.
    """
    if feats == "_" or not feats:
        return {}
    return dict(
        segment.split("=", 1)
        for segment in feats.split("|")
        if "=" in segment
    )


def _dict_to_feats(d: Dict[str, str]) -> str:
    """Serialise a feature dict as a CoNLL-U FEATS string.

    Returns ``"_"`` for an empty dict. Otherwise the ``Name=Value`` pairs are
    joined with ``|`` and ordered by feature name, ignoring case, as the
    CoNLL-U format requires (so ``Number`` precedes ``NumType``). A plain
    code-point sort would put every uppercase letter before every lowercase
    one and get such pairs the wrong way round.
    """
    if not d:
        return "_"
    # The exact name is a secondary key so names differing only in case still
    # sort deterministically.
    ordered = sorted(d.items(), key=lambda item: (item[0].lower(), item[0]))
    return "|".join(f"{name}={value}" for name, value in ordered)


def _merge_feats_preserve(base: str, add: str) -> str:
    """
    Return `base` extended with the features of `add`.

    Keys already present in `base` keep their value; only keys that `base`
    lacks are taken from `add` (so Person/Number supplied by Stanza are never
    replaced).
    """
    merged = _feats_to_dict(base)
    for name, value in _feats_to_dict(add).items():
        if name not in merged:
            merged[name] = value
    return _dict_to_feats(merged)


@dataclass(frozen=True)
class MapItem:
    """Alignment record tying one token fed to Stanza back to its original token.

    One instance exists per standardised subtoken. When an original token's
    standardised form splits on whitespace, the resulting items share the same
    ``orig_i`` / ``orig_tok`` and differ in ``sub_i`` / ``std_tok``.
    """
    orig_i: int       # index in pairs
    sub_i: int        # subtoken index within that orig token
    n_sub: int        # number of subtokens for that orig token
    orig_tok: str     # original surface token
    std_tok: str      # standardized token fed to stanza


def _split_std(std: str, orig: str) -> List[str]:
    """
    Turn one standardised token into the subtoken list given to Stanza.

    The standardised string is split on whitespace; if that leaves nothing
    (empty or whitespace-only string), the original token is used instead.
    """
    pieces = std.split()
    return pieces if pieces else [orig]


def _sentences_from_pairs(pairs: Sequence[Tuple[str, str]]) -> List[List[MapItem]]:
    """Group the standardised token stream into sentences.

    Each pair is expanded into its subtokens, and a sentence is closed right
    after any subtoken that is exactly ".", "!" or "?". Trailing tokens with no
    terminator form a final sentence.
    """
    terminators = (".", "!", "?")
    sentences: List[List[MapItem]] = []
    current: List[MapItem] = []

    for orig_i, (orig_tok, std_tok) in enumerate(pairs):
        pieces = _split_std(std_tok, orig_tok)
        for sub_i, piece in enumerate(pieces):
            current.append(
                MapItem(
                    orig_i=orig_i,
                    sub_i=sub_i,
                    n_sub=len(pieces),
                    orig_tok=orig_tok,
                    std_tok=piece,
                )
            )
            if piece in terminators:
                sentences.append(current)
                current = []

    if current:
        sentences.append(current)
    return sentences


def _choose_representative_word(words, idxs: List[int]) -> int:
    """
    Pick the index in `idxs` of the word that stands for the original token.

    The first word whose UPOS is not "PRON" wins (a missing UPOS counts as
    non-PRON); if every candidate is a PRON, the first index is returned.
    """
    non_pronouns = (i for i in idxs if (words[i].upos or "") != "PRON")
    return next(non_pronouns, idxs[0])


def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    """
    Standardise, parse and project `raw_text` back onto its original tokens.

    One CoNLL-U token per ORIGINAL token.
    Intergaelic-injected pronouns never become tokens; they only contribute
    Person/Number to the original token's FEATS (when a split is present).

    Each sentence is emitted as ``# sent_id``, ``# text`` and
    ``# text_standard`` comment lines, then one tab-separated row per original
    token (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, ``_``, MISC) and
    a blank line. FORM is the original surface token; the other columns come
    from the Stanza analysis of the standardised text.

    Args:
        raw_text: Text to standardise and parse.
        lang: Source-language code passed through to `standardise`.

    Returns:
        The CoNLL-U text, or ``""`` when there are no tokens to parse.

    Raises:
        ValueError: If Stanza's sentences or words do not line up 1:1 with the
            pretokenised input (projection would otherwise be silently wrong),
            or if `standardise` rejects the service output.
    """
    pairs = standardise(raw_text, lang)
    sents = _sentences_from_pairs(pairs)
    if not sents:
        # Nothing to parse (e.g. empty input); don't hand Stanza an empty doc.
        return ""

    pretok: List[List[str]] = [[m.std_tok for m in sent] for sent in sents]
    doc = nlp(pretok)

    # zip() below would silently drop sentences on a mismatch, so check first.
    if len(doc.sentences) != len(sents):
        raise ValueError(
            f"Stanza returned {len(doc.sentences)} sentences for "
            f"{len(sents)} pretokenized sentences"
        )

    out: List[str] = []

    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        raw_slice = [m.orig_tok for m in sent_map if m.sub_i == 0]
        std_slice = [m.std_tok for m in sent_map]
        out += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        words = sent_doc.words
        # Everything below indexes `words` by position in `sent_map`.
        if len(words) != len(sent_map):
            raise ValueError(
                f"Sentence {sid}: Stanza returned {len(words)} words for "
                f"{len(sent_map)} pretokenized tokens"
            )

        # Original tokens of this sentence in order of appearance, and the
        # Stanza word indices (0-based) that each one produced.
        orig_keys: List[int] = []
        orig_i_to_widxs: Dict[int, List[int]] = {}
        for widx, m in enumerate(sent_map):
            if not orig_keys or orig_keys[-1] != m.orig_i:
                orig_keys.append(m.orig_i)
            orig_i_to_widxs.setdefault(m.orig_i, []).append(widx)

        # Sentence-local CoNLL-U ids (1..N), per original token and per word.
        orig_i_to_tid = {orig_i: k + 1 for k, orig_i in enumerate(orig_keys)}
        widx_to_tid: Dict[int, int] = {
            widx: orig_i_to_tid[orig_i]
            for orig_i, widxs in orig_i_to_widxs.items()
            for widx in widxs
        }

        for orig_i in orig_keys:
            tid = orig_i_to_tid[orig_i]
            widxs = orig_i_to_widxs[orig_i]

            rep = words[_choose_representative_word(words, widxs)]

            # The token attaches to the rest of the sentence the way the
            # topmost word of its group does. Climb while the head is another
            # word from this same original token, so HEAD can never point at
            # the token itself. When the representative's head is already
            # outside the group, `top` stays `rep`.
            top = rep
            for _ in widxs:
                head_widx = (top.head or 0) - 1
                if head_widx < 0 or widx_to_tid.get(head_widx) != tid:
                    break
                top = words[head_widx]

            # Remap head from stanza-word index space to original-token ids.
            head_tid = widx_to_tid.get(top.head - 1, 0) if top.head else 0

            form = pairs[orig_i][0] or "_"
            feats = rep.feats or "_"

            # If Intergaelic split this token, the 2nd part is guaranteed to
            # be a pronoun by the invariant enforced in `standardise`.
            std = pairs[orig_i][1] or ""
            parts = std.split()
            misc = "_"

            if len(parts) == 2:
                inj_pron = parts[1].lower()
                # invariant guarantees it exists in PRON_FEATS
                feats = _merge_feats_preserve(feats, PRON_FEATS[inj_pron])
                # NOTE(review): "|" is also the CoNLL-U MISC item separator,
                # so strict readers may split this value in two. Kept as-is
                # for output compatibility; confirm with downstream consumers.
                misc = f"StdSplit={parts[0]}|{parts[1]}"

            out.append("\t".join([
                str(tid),
                form,
                rep.lemma or "_",
                rep.upos or "_",
                rep.xpos or "_",
                feats,
                str(head_tid),
                top.deprel or "_",
                "_",
                misc,
            ]))

        out.append("")

    return "\n".join(out)