In [None]:
!pip install stanza

In [7]:
import urllib.parse, urllib.request, json, sys
import stanza

In [8]:
STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data   = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs   = {"Content-Type": "application/x-www-form-urlencoded",
              "Accept":        "application/json"}
    req    = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

In [9]:
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=False,
    verbose=False
)

In [10]:
def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    pairs        = standardise(raw_text, lang)
    std_tokens   = [std for _orig, std in pairs]
    std_text     = " ".join(std_tokens)        # one long string for Stanza
    doc          = nlp(std_text)               # Stanza finds sentences

    # Walk through Stanza words in order and align with the pairs list
    idx          = 0
    conllu_lines = []
    for sent_id, sent in enumerate(doc.sentences, 1):
        conllu_lines.append(f"# sent_id = {sent_id}")
        # Build the original sentence text for the comment line
        orig_sentence = " ".join(pairs[i][0] for i in range(idx, idx+len(sent.words)))
        conllu_lines.append(f"# text = {orig_sentence}")

        for wid, word in enumerate(sent.words, 1):
            orig_tok, std_tok = pairs[idx]
            if std_tok != word.text:           # rare but possible mismatch
                print(f"⚠️  token mismatch: “{std_tok}” vs “{word.text}” (will keep Stanza)")
            conllu_lines.append("\t".join([
                str(wid),
                orig_tok,                       # FORM  = original spelling
                word.lemma or "_",              # LEMMA = Stanza
                word.upos  or "_",
                word.xpos  or "_",
                word.feats or "_",
                str(word.head) if word.head else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))
            idx += 1
        conllu_lines.append("")                # blank line between sentences
    return "\n".join(conllu_lines)

In [3]:
def run_parse(raw_text):
    pairs      = standardise(raw_text, "ga")
    sentences  = naive_sentences(pairs)
    doc        = parse_standardised(sentences)
    conllu_out = project(doc, sentences)
    print(conllu_out)


In [4]:
run_parse("Áindrías an Ime.")

# sent_id = 1
# text = Áindrías an Ime .
1	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
2	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	3	det	_	_
3	Ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	1	nmod	_	_
4	.	.	PUNCT	.	_	1	punct	_	_



In [5]:
run_parse("Bhí Áindrías an Ime na chomhnaidhe i mBaile ui Mún i nGleann an Bhaile Dhuibh.")

# sent_id = 1
# text = Bhí Áindrías an Ime na chomhnaidhe i mBaile ui Mún i nGleann an Bhaile Dhuibh .
1	Bhí	bí	VERB	PastInd	Form=Len|Mood=Ind|Tense=Past	_	root	_	_
2	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	1	nsubj	_	_
3	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	4	det	_	_
4	Ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	2	nmod	_	_
5	na	i	ADP	Poss	Gender=Masc|Number=Sing|Person=3|Poss=Yes	6	case	_	_
6	chomhnaidhe	cónaí	NOUN	Noun	Case=Nom|Definite=Def|Form=Len|Gender=Masc|Number=Sing	1	xcomp:pred	_	_
7	i	i	ADP	Simp	_	8	case	_	_
8	mBaile	Baile	PROPN	Noun	Case=Nom|Definite=Def|Form=Ecl|Gender=Masc|Number=Sing	6	nmod	_	_
9	ui	uí	PART	Pat	PartType=Pat	8	flat	_	_
10	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	8	flat:name	_	_
11	i	i	ADP	Simp	_	12	case	_	_
12	nGleann	gleann	NOUN	Noun	Case=Nom|Definite=Def|Form=Ecl|Gender=Masc|Number=Sing	1	obl	_	_
13	an	an	DET	Art	Case=Gen|Definite=Def|Gender=Masc|Number=Sing|PronType=Art

In [6]:
run_parse("Bu leis Baile ui Mún, áit fiche bó ⁊ tarbh leobhtha.")

# sent_id = 1
# text = Bu leis Baile ui Mún , áit fiche bó ⁊ tarbh leobhtha .
1	Bu	is	AUX	Cop	Tense=Past|VerbForm=Cop	3	cop	_	_
2	leis	le	ADP	Simp	_	3	case	_	_
3	Baile	Baile	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
4	ui	uí	PART	Pat	PartType=Pat	3	flat:name	_	_
5	Mún	Mún	PROPN	Noun	Case=Gen|Definite=Def|Gender=Masc|Number=Sing	4	flat:name	_	_
6	,	,	PUNCT	Punct	_	7	punct	_	_
7	áit	áit	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	3	appos	_	_
8	fiche	fiche	NUM	Num	NumType=Card	9	nummod	_	_
9	bó	bó	NOUN	Noun	Case=Nom|Gender=Fem|Number=Sing	7	nmod	_	_
10	⁊	⁊	ADP	Simp	_	11	case	_	_
11	tarbh	tarbh	NOUN	Noun	Case=Nom|Gender=Masc|Number=Sing	7	nmod	_	_
12	leobhtha	le	ADP	Prep	Number=Plur|Person=3	3	obl:prep	_	_
13	.	.	PUNCT	.	_	3	punct	_	_



In [11]:
sample = "Do bhíodh longphort ag na Lochlannaigh anseo. D’éirigh an t‑árd‑rí."
print(project_with_stanza(sample))


IndexError: list index out of range