In [1]:
%%capture
!pip install stanza

In [2]:
import urllib.parse, urllib.request, json, sys
import stanza

In [3]:
STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data   = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs   = {"Content-Type": "application/x-www-form-urlencoded",
              "Accept":        "application/json"}
    req    = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

In [4]:
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)

In [5]:
# Cell ▸ robust projection with multi‑word support
# -----------------------------------------------
from itertools import groupby
from typing import List, Tuple

def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():                         # e.g. Do → ""  → keep 'Do'
        return [orig]
    return std.split()                          # may yield 1‑N tokens

def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Very light sentence splitter: keep everything up to . ! ?"""
    sent, buf = [], []
    for i, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        for j, part in enumerate(parts):
            buf.append((i, j, len(parts), orig, part))  # mapping entry
            if part in {".", "!", "?"}:
                sent.append(buf);  buf = []
    if buf:
        sent.append(buf)
    return sent                                       # [[mapping …], …]

def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    # 1 ── standardise -------------------------------------------------------
    pairs  = standardise(raw_text, lang)             # [(orig, std), …]

    # 2 ── build *pre‑tokenised* input & a mapping table ---------------------
    sents  = _sentences_from_pairs(pairs)            # list‑of‑sentences
    pretok = [[m[4] for m in sent] for sent in sents]  # token strings only

    # 3 ── parse with Stanza *pretok* mode -----------------------------------
    doc = nlp(pretok)                                # same shape as `sents`

    # 4 ── project back, keeping multi‑word tokens ---------------------------
    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        # comment lines
        raw_slice = [m[3]           for m in sent_map if m[1] == 0]     # first sub‑token per orig
        std_slice = [m[4]           for m in sent_map]                  # every sub‑token
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        # token lines
        widx = 0                                     # index in sent_doc.words
        tid  = 1                                     # running token ID in CONLL‑U
        for m in sent_map:
            orig_i, sub_i, n_sub, orig_tok, std_tok = m
            word = sent_doc.words[widx]

            if sub_i == 0 and n_sub > 1:             # multi‑word‑token header
                conllu_lines.append(f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_")

            # choose FORM for the sub‑token
            form = orig_tok if n_sub == 1 else std_tok

            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos  or "_",
                word.xpos  or "_",
                word.feats or "_",
                str(word.head) if word.head else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))

            widx += 1
            tid  += 1
        conllu_lines.append("")                      # blank line between sents

    return "\n".join(conllu_lines)


In [3]:
def run_parse(raw_text):
    pairs      = standardise(raw_text, "ga")
    sentences  = naive_sentences(pairs)
    doc        = parse_standardised(sentences)
    conllu_out = project(doc, sentences)
    print(conllu_out)


In [6]:
print(project_with_stanza("Áindrías an Ime."))

# sent_id = 1
# text = Áindrías an Ime .
# text_standard = Aindrias an Ime .
1	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
2	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	3	det	_	_
3	Ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	1	nmod	_	_
4	.	.	PUNCT	.	_	1	punct	_	_



In [7]:
print(project_with_stanza("áit an ime."))

# sent_id = 1
# text = áit an ime .
# text_standard = áit an ime .
1	áit	áit	NOUN	Noun	Case=Nom|Definite=Def|Gender=Fem|Number=Sing	_	root	_	_
2	an	an	DET	Art	Case=Gen|Definite=Def|Gender=Masc|Number=Sing|PronType=Art	3	det	_	_
3	ime	ime	NOUN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	1	nmod	_	_
4	.	.	PUNCT	.	_	1	punct	_	_



In [14]:
print(project_with_stanza("Bhí Áindrías an ime na chomhnaidhe i mBaile ui Mún i nGleann an Bhaile Dhuibh."))

# sent_id = 1
# text = Bhí Áindrías an ime na chomhnaidhe i mBaile ui Mún i nGleann an Bhaile Dhuibh .
# text_standard = Bhí Aindrias an ime ina chónaí i mBaile uí Mún i nGleann an Bhaile Dhuibh .
1	Bhí	bí	VERB	PastInd	Form=Len|Mood=Ind|Tense=Past	_	root	_	_
2	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	1	nsubj	_	_
3	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	4	det	_	_
4	ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	2	nmod	_	_
5	na	i	ADP	Poss	Gender=Masc|Number=Sing|Person=3|Poss=Yes	6	case	_	_
6	chomhnaidhe	cónaí	NOUN	Noun	Case=Nom|Definite=Def|Form=Len|Gender=Masc|Number=Sing	1	xcomp:pred	_	_
7	i	i	ADP	Simp	_	8	case	_	_
8	mBaile	Baile	PROPN	Noun	Case=Nom|Definite=Def|Form=Ecl|Gender=Masc|Number=Sing	6	nmod	_	_
9	ui	uí	PART	Pat	PartType=Pat	8	flat	_	_
10	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	8	flat:name	_	_
11	i	i	ADP	Simp	_	12	case	_	_
12	nGleann	gleann	NOUN	Noun	Case=Nom|Definite=Def|Form=Ecl|Gender=Masc|Num

In [6]:
run_parse("Bu leis Baile ui Mún, áit fiche bó ⁊ tarbh leobhtha.")

# sent_id = 1
# text = Bu leis Baile ui Mún , áit fiche bó ⁊ tarbh leobhtha .
1	Bu	is	AUX	Cop	Tense=Past|VerbForm=Cop	3	cop	_	_
2	leis	le	ADP	Simp	_	3	case	_	_
3	Baile	Baile	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
4	ui	uí	PART	Pat	PartType=Pat	3	flat:name	_	_
5	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	4	flat:name	_	_
6	,	,	PUNCT	Punct	_	7	punct	_	_
7	áit	áit	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	3	appos	_	_
8	fiche	fiche	NUM	Num	NumType=Card	9	nummod	_	_
9	bó	bó	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	7	nmod	_	_
10	⁊	⁊	ADP	Simp	_	11	case	_	_
11	tarbh	tarbh	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	7	nmod	_	_
12	leobhtha	le	ADP	Prep	Number=Plur|Person=3	3	obl:prep	_	_
13	.	.	PUNCT	.	_	3	punct	_	_



In [11]:
sample = "Do bhíodh longphort ag na Lochlannaigh anseo. D’éirigh an t‑árd‑rí."
print(project_with_stanza("Bu leis Baile ui Mún, áit fiche bó agus tarbh leobhtha."))


# sent_id = 1
# text = Bu leis Baile ui Mún , áit fiche bó agus tarbh leobhtha .
# text_standard = Ba leis Baile uí Mún , áit fiche bó agus tarbh leo .
1	Bu	is	AUX	Cop	Tense=Past|VerbForm=Cop	3	cop	_	_
2	leis	le	ADP	Simp	_	3	case	_	_
3	Baile	Baile	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
4	ui	uí	PART	Pat	PartType=Pat	3	flat:name	_	_
5	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	4	flat:name	_	_
6	,	,	PUNCT	Punct	_	7	punct	_	_
7	áit	áit	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	3	appos	_	_
8	fiche	fiche	NUM	Num	NumType=Card	9	nummod	_	_
9	bó	bó	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	7	nmod	_	_
10	agus	agus	CCONJ	Coord	_	11	cc	_	_
11	tarbh	tarbh	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	3	conj	_	_
12	leobhtha	le	ADP	Prep	Number=Plur|Person=3	11	obl:prep	_	_
13	.	.	PUNCT	.	_	3	punct	_	_

