In [None]:
%%capture
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
import urllib.parse, urllib.request, json, sys
import stanza

In [None]:
STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str = "ga", timeout: float = 30.0):
    """POST ``text`` to the Intergaelic endpoint and return the decoded JSON reply.

    Args:
        text: Raw text to standardise; sent as the ``teacs`` form field.
        lang: Source-language code; sent as the ``foinse`` form field.
        timeout: Seconds to wait on the network before giving up.  Without a
            timeout a stalled server would block the kernel indefinitely.

    Returns:
        The parsed JSON body.  The callers in this notebook unpack each entry
        as a two-item ``(orig_tok, std_tok)`` pair.

    Raises:
        urllib.error.URLError: If the request fails (including HTTP errors).
        TimeoutError: If the server does not answer within ``timeout`` seconds.
    """
    payload = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
    }
    # Supplying a body makes urllib issue a POST request.
    request = urllib.request.Request(STD_API, data=payload, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        return json.loads(resp.read())

In [None]:
# One processor chain, shared by the model download and the pipeline itself.
_GA_PROCESSORS = "tokenize,pos,lemma,depparse"

stanza.download("ga", processors=_GA_PROCESSORS, verbose=False)

# tokenize_pretokenized=False: Stanza does its own sentence splitting and
# tokenisation, so this pipeline expects plain text as input.
nlp = stanza.Pipeline(
    lang="ga",
    processors=_GA_PROCESSORS,
    tokenize_pretokenized=False,
    verbose=False,
)

In [None]:
# Cell ▸ robust projection with multi‑word support
# -----------------------------------------------
from itertools import groupby
from typing import List, Tuple

def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():                         # e.g. Do → ""  → keep 'Do'
        return [orig]
    return std.split()                          # may yield 1‑N tokens

def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Group (orig, std) pairs into sentences, ending each at ``.``, ``!`` or ``?``.

    Args:
        pairs: Sequence of two-item ``(orig_tok, std_tok)`` entries.

    Returns:
        A list of sentences.  Each sentence is a list of mapping tuples
        ``(pair_index, sub_index, n_sub, orig_tok, part)`` with one tuple per
        token produced by ``_split_std`` for the pair.

    A sentence is only closed once *all* sub-tokens of the current pair have
    been appended.  Closing it in the middle of a pair would split a
    multi-word token across two sentences, leaving its ``n_sub`` range
    pointing past the end of the first sentence and the second sentence
    without the pair's first sub-token.
    """
    terminators = {".", "!", "?"}
    sentences, current = [], []
    for pair_idx, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        ends_sentence = False
        for part_idx, part in enumerate(parts):
            current.append((pair_idx, part_idx, len(parts), orig, part))
            if part in terminators:
                ends_sentence = True
        if ends_sentence:
            sentences.append(current)
            current = []
    if current:                       # trailing text without a terminator
        sentences.append(current)
    return sentences

# Lazily built Stanza pipeline that accepts pre-tokenised input.
_pretok_nlp = None


def _get_pretok_nlp():
    """Return a cached Irish Stanza pipeline built with ``tokenize_pretokenized=True``."""
    global _pretok_nlp
    if _pretok_nlp is None:
        # Same language and processors as the notebook's `nlp`, but Stanza must
        # not re-tokenise: the projection needs exactly one word per token.
        _pretok_nlp = stanza.Pipeline(
            lang="ga",
            processors="tokenize,pos,lemma,depparse",
            tokenize_pretokenized=True,
            verbose=False,
        )
    return _pretok_nlp


def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    """Standardise ``raw_text``, parse the standard form, and emit CoNLL-U.

    The text is standardised with ``standardise``, grouped into sentences with
    ``_sentences_from_pairs``, and parsed as pre-tokenised input.  Annotations
    are then written against the original tokens.  A pair whose standard form
    expands to several tokens is written as a multi-word token: a range line
    carrying the original form, followed by one line per standardised
    sub-token.

    Args:
        raw_text: Text to standardise and parse.
        lang: Source-language code passed to ``standardise``.

    Returns:
        CoNLL-U text with ``# sent_id``, ``# text`` and ``# text_standard``
        comments per sentence, or ``""`` when there is nothing to parse.

    Raises:
        ValueError: If Stanza returns a different number of sentences or
            words than were supplied, since annotations could no longer be
            aligned with the original tokens.
    """
    # 1 -- standardise ---------------------------------------------------------
    pairs = standardise(raw_text, lang)

    # 2 -- build pre-tokenised input and the mapping table ---------------------
    sents = _sentences_from_pairs(pairs)
    if not sents:
        return ""
    pretok = [[m[4] for m in sent] for sent in sents]

    # 3 -- parse in pre-tokenised mode -----------------------------------------
    doc = _get_pretok_nlp()(pretok)
    if len(doc.sentences) != len(sents):
        raise ValueError(
            f"Stanza returned {len(doc.sentences)} sentences for {len(sents)} supplied"
        )

    # 4 -- project back, keeping multi-word tokens -----------------------------
    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        if len(sent_doc.words) != len(sent_map):
            raise ValueError(
                f"sentence {sid}: Stanza returned {len(sent_doc.words)} words "
                f"for {len(sent_map)} supplied tokens"
            )

        raw_slice = [m[3] for m in sent_map if m[1] == 0]   # one entry per original token
        std_slice = [m[4] for m in sent_map]                # every sub-token
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        for tid, (m, word) in enumerate(zip(sent_map, sent_doc.words), 1):
            _pair_idx, sub_i, n_sub, orig_tok, std_tok = m

            if sub_i == 0 and n_sub > 1:                    # multi-word-token header
                conllu_lines.append(
                    f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_"
                )

            # Single tokens keep the original spelling; sub-tokens of a
            # multi-word token use the standardised piece.
            form = orig_tok if n_sub == 1 else std_tok

            # HEAD 0 marks the root and must be written out; a plain truthiness
            # test would turn it into "_".
            head = "_" if word.head is None else str(word.head)

            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",
                "_",
            ]))
        conllu_lines.append("")                             # blank line between sentences

    return "\n".join(conllu_lines)


In [None]:
def run_parse(raw_text):
    """Standardise ``raw_text`` as Irish, parse it, and print the projected result.

    Nothing is returned; the output is only printed.

    NOTE(review): ``naive_sentences``, ``parse_standardised`` and ``project``
    are not defined in the visible cells -- confirm they exist before relying
    on a fresh Restart & Run All.
    """
    sentences = naive_sentences(standardise(raw_text, "ga"))
    parsed = parse_standardised(sentences)
    print(project(parsed, sentences))


In [None]:
# Smoke test on a short phrase; run_parse prints its result rather than returning it.
run_parse("Áindrías an Ime.")

# sent_id = 1
# text = Áindrías an Ime .
1	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
2	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	3	det	_	_
3	Ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	1	nmod	_	_
4	.	.	PUNCT	.	_	1	punct	_	_



In [None]:
# A longer single sentence containing the same name as the short example.
run_parse("Bhí Áindrías an Ime na chomhnaidhe i mBaile ui Mún i nGleann an Bhaile Dhuibh.")

# sent_id = 1
# text = Bhí Áindrías an Ime na chomhnaidhe i mBaile ui Mún i nGleann an Bhaile Dhuibh .
1	Bhí	bí	VERB	PastInd	Form=Len|Mood=Ind|Tense=Past	_	root	_	_
2	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	1	nsubj	_	_
3	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	4	det	_	_
4	Ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	2	nmod	_	_
5	na	i	ADP	Poss	Gender=Masc|Number=Sing|Person=3|Poss=Yes	6	case	_	_
6	chomhnaidhe	cónaí	NOUN	Noun	Case=Nom|Definite=Def|Form=Len|Gender=Masc|Number=Sing	1	xcomp:pred	_	_
7	i	i	ADP	Simp	_	8	case	_	_
8	mBaile	Baile	PROPN	Noun	Case=Nom|Definite=Def|Form=Ecl|Gender=Masc|Number=Sing	6	nmod	_	_
9	ui	uí	PART	Pat	PartType=Pat	8	flat	_	_
10	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	8	flat:name	_	_
11	i	i	ADP	Simp	_	12	case	_	_
12	nGleann	gleann	NOUN	Noun	Case=Nom|Definite=Def|Form=Ecl|Gender=Masc|Number=Sing	1	obl	_	_
13	an	an	DET	Art	Case=Gen|Definite=Def|Gender=Masc|Number=Sing|PronType=Art

In [None]:
# Sentence containing a comma and the "⁊" sign as a separate token.
run_parse("Bu leis Baile ui Mún, áit fiche bó ⁊ tarbh leobhtha.")

# sent_id = 1
# text = Bu leis Baile ui Mún , áit fiche bó ⁊ tarbh leobhtha .
1	Bu	is	AUX	Cop	Tense=Past|VerbForm=Cop	3	cop	_	_
2	leis	le	ADP	Simp	_	3	case	_	_
3	Baile	Baile	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
4	ui	uí	PART	Pat	PartType=Pat	3	flat:name	_	_
5	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	4	flat:name	_	_
6	,	,	PUNCT	Punct	_	7	punct	_	_
7	áit	áit	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	3	appos	_	_
8	fiche	fiche	NUM	Num	NumType=Card	9	nummod	_	_
9	bó	bó	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	7	nmod	_	_
10	⁊	⁊	ADP	Simp	_	11	case	_	_
11	tarbh	tarbh	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	7	nmod	_	_
12	leobhtha	le	ADP	Prep	Number=Plur|Person=3	3	obl:prep	_	_
13	.	.	PUNCT	.	_	3	punct	_	_



In [None]:
# Two-sentence sample run through project_with_stanza; the returned CoNLL-U
# string is printed.
sample = "Do bhíodh longphort ag na Lochlannaigh anseo. D’éirigh an t‑árd‑rí."
print(project_with_stanza(sample))


# sent_id = 1
# text = Do bhíodh longphort ag na Lochlannaigh anseo .
# text_standard = Bhíodh longfort ag na Lochlannaigh anseo .
1	Do bhíodh	bí	VERB	PastImp	Aspect=Imp|Form=Len|Tense=Past	_	root	_	_
2	longphort	longfort	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	1	nsubj	_	_
3	ag	ag	ADP	Simp	_	5	case	_	_
4	na	an	DET	Art	Definite=Def|Number=Plur|PronType=Art	5	det	_	_
5	Lochlannaigh	lochlannach	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Plur	1	obl	_	_
6	anseo	anseo	ADV	Loc	_	1	advmod	_	_
7	.	.	PUNCT	.	_	1	punct	_	_

# sent_id = 2
# text = D'éirigh an t-árd-rí .
# text_standard = D'éirigh an t-ard-rí .
1	D'éirigh	do	PART	Vb	PartType=Vb	2	mark:prt	_	_
2	an	éirigh	VERB	VI	Mood=Ind|Tense=Past	_	root	_	_
3	t-árd-rí	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	4	det	_	_
4	.	arda	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	2	nsubj	_	_
5	.	.	PUNCT	.	_	2	punct	_	_

