In [1]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [2]:
import sys
import json
import urllib.parse
import urllib.request
import stanza
from itertools import groupby

STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str):
    """
    Return a list of (orig_tok, std_tok) pairs from the Intergaelic API.
    """
    params = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    headers = {"Content-Type": "application/x-www-form-urlencoded",
               "Accept":        "application/json"}
    req = urllib.request.Request(STD_API, params, headers=headers)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

def naive_sentences(pairs):
    """
    Split the pair list into sentences whenever a token ends in . ! or ? .
    Stanza will be run in *tokenise‑pretokenised* mode, so sentences are
    lists‑of‑lists of *standardised* tokens.
    """
    sentence, current = [], []
    for orig, std in pairs:
        current.append((orig, std))
        if std.endswith((".", "!", "?")):
            sentence.append(current)
            current = []
    if current:
        sentence.append(current)
    return sentence

def parse_standardised(sentences):
    """
    Run Stanza’s Irish pipeline on the already‑tokenised sentences and return
    the resulting Document object.
    """
    stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

    nlp = stanza.Pipeline(
        lang="ga",
        processors="tokenize,pos,lemma,depparse",
        tokenize_pretokenized=True,
        verbose=False,
    )
    std_sents = [[std for _orig, std in sent] for sent in sentences]
    return nlp(std_sents)

def project(doc, sentences):
    """
    Yield CONLL‑U lines containing the original token but the Stanza annotation
    (lemma, UPOS, feats, head, deprel, …) taken from the aligned standard token.
    Assumes one‑to‑one alignment, which Scannell (2022) reports holds for ≈97 %
    of tokens; multi‑word tokens and many‑to‑one cases need extra work.
    """
    conllu_lines = []
    sent_id = 1
    for sent_pairs, sent_ann in zip(sentences, doc.sentences):
        conllu_lines.append(f"# sent_id = {sent_id}")
        conllu_lines.append(f"# text = {' '.join(orig for orig, _ in sent_pairs)}")
        for i, (pair, word) in enumerate(zip(sent_pairs, sent_ann.words), start=1):
            orig_tok, _ = pair
            # CoNLL‑U columns: ID, FORM, LEMMA, UPOS, XPOS, FEATS,
            #                  HEAD, DEPREL, DEPS, MISC
            line = [
                str(i),
                orig_tok,
                word.lemma or "_",
                word.upos  or "_",
                word.xpos  or "_",
                word.feats or "_",
                str(word.head) if word.head else "_",
                word.deprel or "_",
                "_",
                "_",
            ]
            conllu_lines.append("\t".join(line))
        conllu_lines.append("")   # blank line between sentences
        sent_id += 1
    return "\n".join(conllu_lines)

In [3]:
def run_parse(raw_text):
    pairs      = standardise(raw_text, "ga")
    sentences  = naive_sentences(pairs)
    doc        = parse_standardised(sentences)
    conllu_out = project(doc, sentences)
    print(conllu_out)


In [4]:
run_parse("Áindrías an Ime.")

# sent_id = 1
# text = Áindrías an Ime .
1	Áindrías	Aindrias	PROPN	Noun	Definite=Def|Gender=Masc|Number=Sing	_	root	_	_
2	an	an	DET	Art	Definite=Def|Number=Sing|PronType=Art	3	det	_	_
3	Ime	ime	NOUN	Noun	Case=Nom|Definite=Def|Gender=Masc|Number=Sing	1	nmod	_	_
4	.	.	PUNCT	.	_	1	punct	_	_

