In [1]:
%%capture
!pip install stanza

In [2]:
import urllib.parse, urllib.request, json, sys
import stanza

In [3]:
STD_API = "https://cadhan.com/api/intergaelic/3.0"

def standardise(text: str, lang: str = "ga"):
    """Return a list of (orig_tok, std_tok) pairs from Intergaelic."""
    data   = urllib.parse.urlencode({"foinse": lang, "teacs": text}).encode()
    hdrs   = {"Content-Type": "application/x-www-form-urlencoded",
              "Accept":        "application/json"}
    req    = urllib.request.Request(STD_API, data, hdrs)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

In [10]:
standardise("Do bhris me e.")

[['Do bhris', 'Bhris'], ['me', 'mé'], ['e', 'é'], ['.', '.']]

In [14]:
stanza.download("ga", processors="tokenize,pos,lemma,depparse", verbose=False)

nlp = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=True,
    no_ssplit=True,
    verbose=False
)

In [6]:
from itertools import groupby
from typing import List, Tuple

def _split_std(std: str, orig: str) -> List[str]:
    """Return the token(s) that should feed Stanza for this pair."""
    if not std.strip():
        return [orig]
    return std.split()

def _sentences_from_pairs(pairs: List[Tuple[str, str]]):
    """Very light sentence splitter: keep everything up to . ! ?"""
    sent, buf = [], []
    for i, (orig, std) in enumerate(pairs):
        parts = _split_std(std, orig)
        for j, part in enumerate(parts):
            buf.append((i, j, len(parts), orig, part))
            if part in {".", "!", "?"}:
                sent.append(buf);  buf = []
    if buf:
        sent.append(buf)
    return sent

def project_with_stanza(raw_text: str, lang: str = "ga") -> str:
    pairs  = standardise(raw_text, lang)

    sents  = _sentences_from_pairs(pairs)
    pretok = [[m[4] for m in sent] for sent in sents]

    doc = nlp(pretok)

    conllu_lines = []
    for sid, (sent_map, sent_doc) in enumerate(zip(sents, doc.sentences), 1):
        raw_slice = [m[3] for m in sent_map if m[1] == 0]
        std_slice = [m[4] for m in sent_map]
        conllu_lines += [
            f"# sent_id = {sid}",
            f"# text = {' '.join(raw_slice)}",
            f"# text_standard = {' '.join(std_slice)}",
        ]

        # token lines
        widx = 0
        tid  = 1
        for m in sent_map:
            orig_i, sub_i, n_sub, orig_tok, std_tok = m
            word = sent_doc.words[widx]

            if sub_i == 0 and n_sub > 1:
                conllu_lines.append(f"{tid}-{tid+n_sub-1}\t{orig_tok}\t_\t_\t_\t_\t_\t_\t_\t_")

            form = orig_tok if n_sub == 1 else std_tok

            conllu_lines.append("\t".join([
                str(tid),
                form,
                word.lemma or "_",
                word.upos  or "_",
                word.xpos  or "_",
                word.feats or "_",
                str(word.head) if word.head else "_",
                word.deprel or "_",
                "_",
                "_",
            ]))

            widx += 1
            tid  += 1
        conllu_lines.append("")

    return "\n".join(conllu_lines)


In [7]:
PAGE = """Maidin amháin, chuaigh gamal as Gúra amach chun
connadh a bhailiú. Shiúil sé leis thar an machaire gur tháinig
sé go dtí crann mór ológ cois abhann.

Dhreap sé an crann agus shuigh sé faoi ar an ngéag ba mhó ar
an gcrann. Nuair a bhraith sé ar a shocracht ar an ngéag,
thosaigh sé á bualadh le tua.

Ghabh an sagart thar bráid. D'fhéach sé in airde agus labhair
leis an ngamal:

"A dheartháirín ó, cad atá ar bun agat?" ar sé os ard. "Ní mar
sin a ghearrtar adhmad!"

"Níl aon bhealach eile ann, a Athair," arsa an gamal.
"Caithfear an tua a ardú agus é a bhualadh anuas ar an
adhmad!"

"Ach tá tú i do shuí ar an ngéag atá á gearradh agat! Brisfidh
an ghéag, agus nuair a bhrisfidh titfidh tusa go talamh agus
marófar thú,” arsa an sagart.
"""

In [11]:
paras = [x.replace("\n", " ") for x in PAGE.split("\n\n")]

In [12]:
paras

['Maidin amháin, chuaigh gamal as Gúra amach chun connadh a bhailiú. Shiúil sé leis thar an machaire gur tháinig sé go dtí crann mór ológ cois abhann.',
 'Dhreap sé an crann agus shuigh sé faoi ar an ngéag ba mhó ar an gcrann. Nuair a bhraith sé ar a shocracht ar an ngéag, thosaigh sé á bualadh le tua.',
 "Ghabh an sagart thar bráid. D'fhéach sé in airde agus labhair leis an ngamal:",
 '"A dheartháirín ó, cad atá ar bun agat?" ar sé os ard. "Ní mar sin a ghearrtar adhmad!"',
 '"Níl aon bhealach eile ann, a Athair," arsa an gamal. "Caithfear an tua a ardú agus é a bhualadh anuas ar an adhmad!"',
 '"Ach tá tú i do shuí ar an ngéag atá á gearradh agat! Brisfidh an ghéag, agus nuair a bhrisfidh titfidh tusa go talamh agus marófar thú,” arsa an sagart. ']

In [8]:
nlp_tok = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,lemma,depparse",
    # Let Stanza decide sentences & tokens
    tokenize_pretokenized=False,
    verbose=False
)

In [21]:
pp = project_with_stanza("E-, ‘firing range’ a mbíonns acub agus é seo agus é siúd.")

In [22]:
print(pp)

# sent_id = 1
# text = E - , ‘ firing range ’ a mbíonns acub agus é seo agus é siúd .
# text_standard = É - , ‘ firing range ’ a mbíonn acu agus é seo agus é siúd .
1	E	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	_	root	_	_
2	-	-	PUNCT	.	_	4	punct	_	_
3	,	,	PUNCT	Punct	_	4	punct	_	_
4	‘	‘	X	Foreign	Foreign=Yes	1	parataxis	_	_
5	firing	firing	X	Foreign	Foreign=Yes	4	flat:foreign	_	_
6	range	range	X	Foreign	Foreign=Yes	4	flat:foreign	_	_
7	’	’	PUNCT	Punct	_	9	punct	_	_
8	a	a	PART	Vb	Form=Indirect|PartType=Vb|PronType=Rel	9	mark:prt	_	_
9	mbíonns	bí	VERB	PresImp	Aspect=Hab|Form=Ecl|Mood=Ind|Tense=Pres	4	acl:relcl	_	_
10	acub	ag	ADP	Prep	Number=Plur|Person=3	9	obl:prep	_	_
11	agus	agus	CCONJ	Coord	_	12	cc	_	_
12	é	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	1	conj	_	_
13	seo	seo	PRON	Dem	PronType=Dem	12	det	_	_
14	agus	agus	SCONJ	Subord	_	15	mark	_	_
15	é	é	PRON	Pers	Gender=Masc|Number=Sing|Person=3	12	conj	_	_
16	siúd	siúd	PRON	Dem	PronType=Dem	15	det	_	_
17	.	.	PUNCT	.	_	1	punct	_	_



In [None]:
nlp_tok("E-, ‘firing range’ a mbíonns acu agus é seo agus é siúd.")

In [57]:
from stanza.pipeline.processor import ProcessorVariant, Processor, register_processor_variant, register_processor
from stanza.pipeline._constants import *

@register_processor("custompos")
class CustomPOSProcessor(Processor):
    REQUI
    _provides = set(['pos'])
    def __init__(self,  device, config, pipeline):
        super().__init__(device, config, pipeline)
        # Custom POS/morphological data
        self.pos_data = {
            "e-": {"upos": "INTJ", "xpos": "Itj"},
            "u-": {"upos": "INTJ", "xpos": "Itj"}
        }

    def process(self, doc):
        for sentence in doc.sentences:
            for word in sentence.words:
                if word.text in self.pos_data:
                    word.upos = self.pos_data[word.text]["upos"]
                    word.xpos = self.pos_data[word.text]["xpos"]
                    if "feats" in self.pos_data[word.text]:
                        word.feats = self.pos_data[word.text]["feats"]
        return doc


In [58]:
nlp_custom = stanza.Pipeline(
    lang="ga",
    processors="tokenize,pos,custompos,lemma,depparse",
    tokenize_pretokenized=False,
    verbose=False,
    pos={"variant": "custompos"}
)

AttributeError: type object 'CustomPOSProcessor' has no attribute 'PROVIDES_DEFAULT'

In [51]:
nlp_custom("E-, ‘firing range’ a mbíonns acu agus é seo agus é siúd.")

[
  [
    {
      "id": 1,
      "text": "E-",
      "lemma": "E-",
      "upos": "NUM",
      "xpos": "Num",
      "head": 0,
      "deprel": "root",
      "start_char": 0,
      "end_char": 2,
      "misc": "SpaceAfter=No"
    },
    {
      "id": 2,
      "text": ",",
      "lemma": ",",
      "upos": "PUNCT",
      "xpos": "Punct",
      "head": 3,
      "deprel": "punct",
      "start_char": 2,
      "end_char": 3
    },
    {
      "id": 3,
      "text": "‘",
      "lemma": "‘",
      "upos": "X",
      "xpos": "Foreign",
      "feats": "Foreign=Yes",
      "head": 1,
      "deprel": "parataxis",
      "start_char": 4,
      "end_char": 5,
      "misc": "SpaceAfter=No"
    },
    {
      "id": 4,
      "text": "firing",
      "lemma": "firing",
      "upos": "X",
      "xpos": "Foreign",
      "feats": "Foreign=Yes",
      "head": 3,
      "deprel": "flat:foreign",
      "start_char": 5,
      "end_char": 11
    },
    {
      "id": 5,
      "text": "range",
      "lemma": "range