In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.11.0-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.15.0 stanza-1.11.0


In [14]:
from dataclasses import dataclass
from typing import List, Tuple, Optional

# Pronoun form -> "Person=...|Number=..." feature string for that pronoun.
PRON_FEATS = dict(
    [
        # singular
        ("mé", "Person=1|Number=Sing"),
        ("tú", "Person=2|Number=Sing"),
        # plural ("muid" and "sinn" carry the same features)
        ("muid", "Person=1|Number=Plur"),
        ("sinn", "Person=1|Number=Plur"),
        ("sibh", "Person=2|Number=Plur"),
        ("siad", "Person=3|Number=Plur"),
    ]
)
# Just the pronoun forms, for fast membership tests.
PRON_FORMS = {*PRON_FEATS}

@dataclass(frozen=True)
class AlignTok:
    """One aligned token pair produced by expand_and_align (immutable)."""
    # Token taken from the original (`orig`) side of a rewrite unit.
    orig_tok: str
    # Token at the same position on the standardised (`std`) side.
    std_tok: str
    # Lowercased pronoun that the std side appended after this token, if any.
    inj_pron: Optional[str] = None  # only set on the LAST orig token in the unit

def expand_and_align(pairs: List[Tuple[str, str]]) -> List[AlignTok]:
    """
    Expand Intergaelic rewrite units to a flat, strictly aligned token stream.

    Each pair is ``(orig, std)``. Both sides are split on whitespace; a side
    that is ``None`` or empty contributes no tokens.

    Allowed:
      1) N→N mapping (orig_parts and std_parts same length): align positionally
      2) Injection mapping where std has exactly ONE extra token and it's a known pronoun:
         len(std)=len(orig)+1 and std[-1] (compared case-insensitively) in PRON_FORMS.
         Align the shared prefix positionally, and attach the lowercased pronoun
         as inj_pron to the LAST aligned token. orig must contain at least one
         token; otherwise there is no token to carry the pronoun.

    Everything else: raise.

    Args:
        pairs: sequence of ``(orig, std)`` strings, one per rewrite unit.

    Returns:
        A flat list with one AlignTok per original token, in input order.

    Raises:
        ValueError: if a pair is neither an N→N mapping nor a valid pronoun
            injection (including a lone pronoun with no original token).
    """
    out: List[AlignTok] = []

    for i, (orig, std) in enumerate(pairs):
        orig_parts = (orig or "").split()
        std_parts = (std or "").split()

        # Case 1: same token count on both sides -> purely positional alignment.
        if len(orig_parts) == len(std_parts):
            out.extend(AlignTok(o, s) for o, s in zip(orig_parts, std_parts))
            continue

        # Case 2: std = orig-aligned tokens + one trailing known pronoun.
        # Requiring a non-empty orig side matters: with zero original tokens the
        # pronoun would have nothing to attach to and would vanish silently.
        is_injection = (
            bool(orig_parts)
            and len(std_parts) == len(orig_parts) + 1
            and std_parts[-1].lower() in PRON_FORMS
        )
        if is_injection:
            inj = std_parts[-1].lower()
            # zip() stops at the shorter side, which leaves out the trailing pronoun.
            aligned = list(zip(orig_parts, std_parts))
            out.extend(AlignTok(o, s) for o, s in aligned[:-1])
            last_orig, last_std = aligned[-1]
            out.append(AlignTok(last_orig, last_std, inj_pron=inj))
            continue

        raise ValueError(
            f"Unsupported Intergaelic mapping at pair index {i}: "
            f"orig={orig!r} ({len(orig_parts)} toks) std={std!r} ({len(std_parts)} toks)"
        )

    return out


In [15]:
# Run one example sentence through project_with_stanza and keep the result for inspection.
# NOTE(review): project_with_stanza is not defined in the cells shown here — it must be defined before this cell runs.
f = project_with_stanza('Do leanadar ag "seasamh a gcirt" go dtí gur dhein Eoghan Rua Ó Néill, ag an mBeinn mBorb, gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin.')

In [16]:
# Show the result as plain text; the captured output appears to be CoNLL-U-style
# (comment lines for sent_id/text/text_standard, then one tab-separated row per token).
print(f)

# sent_id = 1
# text = Do leanadar ag " seasamh a gcirt " go dtí gur dhein Eoghan Rua Ó Néill , ag an mBeinn mBorb , gníomh díreach de shaghas an ghnímh a dhein driotháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimis sin .
# text_standard = Do leanadar ag " seasamh a gcirt " go dtí go ndearna Eoghan Rua Ó Néill , ag an mBinn mBorb , gníomh díreach de shaghas an ghnímh a rinne deartháir a athar agus Aodh Rua Ó Dónaill ag Béal an Átha Buí deich mbliana agus daichead roimhe sin .
1	Do	do	PART	Vb	PartType=Vb	2	mark:prt	_	_
2	leanadar	lean	VERB	VTI	Mood=Ind|Number=Plur|Person=1|Tense=Past	0	root	_	_
3	ag	ag	ADP	Simp	_	5	case	_	_
4	"	"	PUNCT	Punct	_	5	punct	_	SpaceAfter=No
5	seasamh	seasamh	NOUN	Noun	VerbForm=Inf	2	xcomp	_	_
6	a	a	DET	Det	Number=Plur|Person=3|Poss=Yes	7	nmod:poss	_	_
7	gcirt	ceirt	NOUN	Noun	Case=Gen|Definite=Def|Form=Ecl|Gender=Fem|Number=Sing	5	nmod	_	SpaceAfter=No
8	"	"	PUNCT	Punct	_	5	punct	_	_
9	go	go	ADP	Cmpd	PrepForm=Cmpd	11	mar