In [1]:
%%capture
!pip install stanza

: 

In [4]:
import urllib.parse, urllib.request, json, sys
import stanza

In [1]:
STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data   = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs   = {"Content-Type": "application/x-www-form-urlencoded",
              "Accept":        "application/json"}
    req    = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

In [3]:
import stanza
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Users/joregan/opt/anaconda3/envs/stanza/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/joregan/opt/anaconda3/envs/stanza/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/joregan/opt/anaconda3/envs/stanza/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/joregan/opt/anaconda3/envs/stanza/lib/python3.10/site-packages/traitlets/config/

In [5]:
# Cell ▸ robust projection with multi‑word support
# -----------------------------------------------
from itertools import groupby
from typing import List, Tuple

def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():                         # e.g. Do → ""  → keep 'Do'
        return [orig]
    return std.split()                          # may yield 1‑N tokens

def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Very light sentence splitter: keep everything up to . ! ?"""
    sent, buf = [], []
    for i, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        for j, part in enumerate(parts):
            buf.append((i, j, len(parts), orig, part))  # mapping entry
            if part in {".", "!", "?"}:
                sent.append(buf);  buf = []
    if buf:
        sent.append(buf)
    return sent                                       # [[mapping …], …]

def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    # 1 ── standardise -------------------------------------------------------
    pairs  = standardise(raw_text, lang)             # [(orig, std), …]

    # 2 ── build *pre‑tokenised* input & a mapping table ---------------------
    sents  = _sentences_from_pairs(pairs)            # list‑of‑sentences
    pretok = [[m[4] for m in sent] for sent in sents]  # token strings only

    # 3 ── parse with Stanza *pretok* mode -----------------------------------
    doc = nlp(pretok)                                # same shape as `sents`

    # 4 ── project back, keeping multi‑word tokens ---------------------------
    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        # comment lines
        raw_slice = [m[3]           for m in sent_map if m[1] == 0]     # first sub‑token per orig
        std_slice = [m[4]           for m in sent_map]                  # every sub‑token
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        # token lines
        widx = 0                                     # index in sent_doc.words
        tid  = 1                                     # running token ID in CONLL‑U
        for m in sent_map:
            orig_i, sub_i, n_sub, orig_tok, std_tok = m
            word = sent_doc.words[widx]

            if sub_i == 0 and n_sub > 1:             # multi‑word‑token header
                conllu_lines.append(f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_")

            # choose FORM for the sub‑token
            form = orig_tok if n_sub == 1 else std_tok

            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos  or "_",
                word.xpos  or "_",
                word.feats or "_",
                str(word.head) if word.head else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))

            widx += 1
            tid  += 1
        conllu_lines.append("")                      # blank line between sents

    return "\n".join(conllu_lines)


In [3]:
def run_parse(raw_text):
    pairs      = standardise(raw_text, "ga")
    sentences  = naive_sentences(pairs)
    doc        = parse_standardised(sentences)
    conllu_out = project(doc, sentences)
    print(conllu_out)


In [8]:
print(project_with_stanza("Páirc Uí Chaoimh"))

# sent_id = 1
# text = Páirc Uí Chaoimh
# text_standard = Páirc Uí Chaoimh
1	Páirc	páirc	NOUN	Noun	Case=Nom|Definite=Def|Gender=Fem|Number=Sing	_	root	_	_
2	Uí	uí	PART	Pat	PartType=Pat	1	nmod	_	_
3	Chaoimh	caoimh	PROPN	Noun	Case=Gen|Definite=Def|Form=Len|Gender=Masc|Number=Sing	1	flat:name	_	_



In [21]:
print(project_with_stanza("do ól sé"))

# sent_id = 1
# text = do ól sé
# text_standard = do ól sé
1	do	do	PART	Vb	PartType=Vb	2	mark:prt	_	_
2	ól	ól	VERB	VTI	Mood=Ind|Tense=Past	_	root	_	_
3	sé	sé	PRON	Pers	Gender=Masc|Number=Sing|Person=3	2	nsubj	_	_



In [73]:
print(project_with_stanza("Chuir sé ann a bhéil é."))

# sent_id = 1
# text = Chuir sé ann a bhéil é .
# text_standard = Chuir sé ann a bhéil é .
1	Chuir	cuir	VERB	VTI	Form=Len|Mood=Ind|Tense=Past	_	root	_	_
2	sé	sé	PRON	Pers	Gender=Masc|Number=Sing|Person=3	1	nsubj	_	_
3	ann	i	ADP	Prep	Gender=Masc|Number=Sing|Person=3	1	xcomp:pred	_	_
4	a	a	DET	Det	Gender=Masc|Number=Sing|Person=3|Poss=Yes	5	nmod:poss	_	_
5	bhéil	béil	NOUN	Noun	Case=Nom|Definite=Def|Form=Len|Gender=Fem|Number=Sing	1	obj	_	_
6	é	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	1	obj	_	_
7	.	.	PUNCT	.	_	1	punct	_	_



In [74]:
print(project_with_stanza("Chuir sé in uachtar"))

# sent_id = 1
# text = Chuir sé in uachtar
# text_standard = Chuir sé in uachtar
1	Chuir	cuir	VERB	VTI	Form=Len|Mood=Ind|Tense=Past	_	root	_	_
2	sé	sé	PRON	Pers	Gender=Masc|Number=Sing|Person=3	1	nsubj	_	_
3	in	i	ADP	Simp	_	4	case	_	_
4	uachtar	uachtar	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	1	obl	_	_



In [17]:
run_parse("D'fhan sé")

NameError: name 'run_parse' is not defined

In [11]:
sample = "Do bhíodh longphort ag na Lochlannaigh anseo. D’éirigh an t‑árd‑rí."
print(project_with_stanza("Bu leis Baile ui Mún, áit fiche bó agus tarbh leobhtha."))


# sent_id = 1
# text = Bu leis Baile ui Mún , áit fiche bó agus tarbh leobhtha .
# text_standard = Ba leis Baile uí Mún , áit fiche bó agus tarbh leo .
1	Bu	is	AUX	Cop	Tense=Past|VerbForm=Cop	3	cop	_	_
2	leis	le	ADP	Simp	_	3	case	_	_
3	Baile	Baile	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
4	ui	uí	PART	Pat	PartType=Pat	3	flat:name	_	_
5	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	4	flat:name	_	_
6	,	,	PUNCT	Punct	_	7	punct	_	_
7	áit	áit	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	3	appos	_	_
8	fiche	fiche	NUM	Num	NumType=Card	9	nummod	_	_
9	bó	bó	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	7	nmod	_	_
10	agus	agus	CCONJ	Coord	_	11	cc	_	_
11	tarbh	tarbh	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	3	conj	_	_
12	leobhtha	le	ADP	Prep	Number=Plur|Person=3	11	obl:prep	_	_
13	.	.	PUNCT	.	_	3	punct	_	_



In [5]:
nlp_ga = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=False,
    verbose=False
)

In [6]:
PAGE = """Maidin amháin, chuaigh gamal as Gúra amach chun
connadh a bhailiú. Shiúil sé leis thar an machaire gur tháinig
sé go dtí crann mór ológ cois abhann.

Dhreap sé an crann agus shuigh sé faoi ar an ngéag ba mhó ar
an gcrann. Nuair a bhraith sé ar a shocracht ar an ngéag,
thosaigh sé á bualadh le tua.

Ghabh an sagart thar bráid. D'fhéach sé in airde agus labhair
leis an ngamal:

"A dheartháirín ó, cad atá ar bun agat?" ar sé os ard. "Ní mar
sin a ghearrtar adhmad!"

"Níl aon bhealach eile ann, a Athair," arsa an gamal.
"Caithfear an tua a ardú agus é a bhualadh anuas ar an
adhmad!"

"Ach tá tú i do shuí ar an ngéag atá á gearradh agat! Brisfidh
an ghéag, agus nuair a bhrisfidh titfidh tusa go talamh agus
marófar thú,” arsa an sagart.
"""

In [8]:
!pip install numpy



In [9]:
nlp_ga(PAGE)

RuntimeError: Numpy is not available