In [1]:
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    NEMO_SIGMA,
    NEMO_SPACE,
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.pl.graph_utils import PL_ALPHA
from nemo_text_processing.text_normalization.pl.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.pl.taggers.ordinal import adjective_inflection, complete_paradigm
from pynini.lib import pynutil


In [2]:
def make_million(
    number: str,
    non_zero_pl: 'pynini.FstLike',
    non_zero_quant: 'pynini.FstLike',
    case: str = None,
    deterministic: bool = True,
) -> 'pynini.FstLike':
    """
    Build the graph for one three-digit group followed by a large-number noun
    (thousand/million/milliard and higher), inflected for the requested case.

    Args:
        number: the uninflected spelling of the noun, e.g. "milion"
        non_zero_pl: fst for the digit groups that are followed by the plural form
        non_zero_quant: fst for the digit groups that are followed by the quantity form
        case: one of "loc", "ins", "dat", "gen"; None selects the
            nominative/accusative endings. Any other value raises KeyError.
        deterministic: if False, "001" may also be read with an explicit,
            case-inflected "one" in front of the noun

    Returns:
        graph: the union of the singular, plural, quantity and all-zero
            alternatives, followed by insert_space
    """
    # case -> (singular ending, plural ending, inflected form of "one")
    oblique = {
        "loc": ("ie", "ach", "jednym"),
        "ins": ("em", "ami", "jednym"),
        "dat": ("owi", "om", "jednemu"),
        "gen": ("a", "ów", "jednego"),
    }
    if case is None:
        sg_end, pl_end, quant_end, one = "", "y", "ów", "jeden"
    else:
        sg_end, pl_end, one = oblique[case]
        # outside nominative/accusative the quantity form reuses the plural ending
        quant_end = pl_end
        if case == "loc" and number.endswith("ard"):
            # nouns in -ard take "-zie" in the locative singular
            sg_end = "zie"

    singular = f"{number}{sg_end}"
    # hack for the stem change in tysiąc (1000): only the quantity form is affected
    quant_stem = "tysięc" if number == "tysiąc" else number

    # "001" is read as the bare singular noun
    graph = pynutil.add_weight(pynini.cross("001", singular), -0.001)
    if not deterministic:
        graph |= pynutil.add_weight(pynini.cross("001", f"{one} {singular}"), -0.001)
    graph |= non_zero_pl + pynutil.insert(f" {number}{pl_end}")
    graph |= non_zero_quant + pynutil.insert(f" {quant_stem}{quant_end}")
    # an all-zero group is deleted without producing a word
    graph |= pynutil.delete("000")
    graph += insert_space
    return graph

In [3]:
def make_inflected_graph_dict(file_path: str, cross: str, deterministic=False) -> dict:
    """
    Load a two-column TSV and build, per key, a graph rewriting ``cross`` to
    the value(s) listed for that key.

    Args:
        file_path: path of the TSV, resolved through get_abs_path; rows are (key, value)
        cross: the input string every graph accepts
        deterministic: if True, only the first value listed for a key is kept;
            otherwise all values listed for a key are unioned

    Returns:
        dict mapping each key to its graph
    """
    graphs = {}
    for row in load_labels(get_abs_path(file_path)):
        label, surface = row[0], row[1]
        if label in graphs:
            # a repeated key only contributes an alternative in non-deterministic mode
            if deterministic:
                continue
            graphs[label] |= pynini.cross(cross, surface)
        else:
            graphs[label] = pynini.cross(cross, surface)
    return graphs


def get_nominal_inflections(inflection_file, noun_file):
    """
    Expand every noun listed in ``noun_file`` into its full paradigm.

    Args:
        inflection_file: path (resolved through get_abs_path) of a TSV whose rows
            are (form label, ending); the "sg_nom" row gives the ending of the
            citation form and may be the empty string
        noun_file: path (resolved through get_abs_path) of a TSV whose rows are
            (word, digit)

    Returns:
        dict mapping each digit to a {form label: inflected word} dict

    Raises:
        AssertionError: if a word does not end with the "sg_nom" ending
        KeyError: if a noun is processed and the inflection table has no "sg_nom" row
    """
    inflections = {row[0]: row[1] for row in load_labels(get_abs_path(inflection_file))}
    output = {}
    for digit_noun in load_labels(get_abs_path(noun_file)):
        word, digit = digit_noun[0], digit_noun[1]
        lemma_ending = inflections["sg_nom"]
        assert word.endswith(lemma_ending), f"Word {word} does not end with {lemma_ending}"
        # Slice with an explicit end index: word[:-len(lemma_ending)] collapses to
        # word[:0] (an empty stem) when the nominative singular ending is "".
        stem = word[: len(word) - len(lemma_ending)]
        output[digit] = {label: stem + ending for label, ending in inflections.items()}
    return output

In [4]:
def get_nominal_graph(inflection_file, noun_file):
    """
    Build one graph per form label from the paradigms returned by
    get_nominal_inflections.

    Args:
        inflection_file: forwarded to get_nominal_inflections
        noun_file: forwarded to get_nominal_inflections

    Returns:
        dict mapping each form label to the union, over all digits, of the
        graph rewriting that digit to its word form for the label
    """
    paradigms = get_nominal_inflections(inflection_file, noun_file)
    graphs = {}
    for digit, wordforms in paradigms.items():
        for label, form in wordforms.items():
            arc = pynini.cross(digit, form)
            if label in graphs:
                graphs[label] |= arc
            else:
                graphs[label] = arc
    return graphs

In [5]:
# Notebook-level switch: when False, jeden_graph below also accepts every other
# inflected form of "jeden".
deterministic = True  # Set to False for non-deterministic graphs

# Forms of "jeden" (one), indexed by labels such as "mi_sg_nom".
jeden_all = adjective_inflection("jeden")
# Starts as "1" -> the "mi_sg_nom" form; further forms are unioned in below.
jeden_graph = pynini.cross("1", jeden_all["mi_sg_nom"])
# in compound numbers, jeden does not inflect
jeden_only = pynini.cross("1", jeden_all["mi_sg_nom"])
if not deterministic:
    for key in jeden_all:
        # the "mi_sg_nom" form is already the base of jeden_graph
        if key == "mi_sg_nom":
            continue
        jeden_graph |= pynini.cross("1", jeden_all[key])
# Called only for its effect on jeden_all (the return value is discarded);
# presumably fills in forms missing from the dict -- TODO confirm against
# taggers/ordinal.py. Note that it runs after the jeden_graph loop above.
complete_paradigm(jeden_all)
# One "1" -> form graph per label present in jeden_all at this point.
self_jeden_all = {a[0]: pynini.cross("1", a[1]) for a in jeden_all.items()}
# Per-form graphs for zero, built from the noun_nt_ro.tsv inflection table and
# the zero.tsv word list.
self_zero_all = get_nominal_graph("data/grammar/noun_nt_ro.tsv", "data/numbers/zero.tsv")

cases = ["nom", "gen", "dat", "acc", "ins", "loc", "voc"]
# Keep only the "mi_sg_<case>" graphs, re-keyed by the bare case name.
# Raises KeyError if any of these labels is absent from self_jeden_all.
jeden_filt = {}
for case in cases:
    jeden_filt[case] = self_jeden_all[f'mi_sg_{case}']

In [11]:
# Spot check: compose the input "0" with the "sg_gen" graph for zero and show
# the resulting string.
("0" @ self_zero_all["sg_gen"]).string()

'zera'

In [12]:
# Rows of the noun_nt_ro.tsv table, loaded here to reuse its first column.
# NOTE(review): despite the variable name, this is the file passed to
# get_nominal_graph as the inflection table, not a digit/noun list.
digit_noun_tsv = load_labels(get_abs_path("data/grammar/noun_nt_ro.tsv"))

In [18]:
# First column of every loaded row, i.e. the form labels of the table.
noun_labels = [row[0] for row in digit_noun_tsv]

In [None]:
# Declension table of "miliard": <case name> TAB <singular> TAB <plural>.
TYS = """
nominative	miliard	miliardy
genitive	miliarda	miliardów
dative	miliardowi	miliardom
accusative	miliard	miliardy
instrumental	miliardem	miliardami
locative	miliardzie	miliardach
vocative	miliardzie	miliardy
"""
# Remove the stem so only the endings remain; singular and plural endings are
# collected separately. Rows without a tab or without exactly three columns
# are ignored.
sg, pl = [], []
for row in TYS.strip().split("\n"):
    if "\t" in row:
        columns = row.replace("miliard", "").split("\t")
        if len(columns) == 3:
            sg.append(columns[1])
            pl.append(columns[2])
# All singular endings first, then all plural endings, each in the row order of TYS.
forms = sg + pl


In [21]:
# Display the collected endings (singular entries first, then plural).
# NOTE(review): the saved output below lists 'u' and 'ie' where the current TYS
# table yields 'a' and 'zie', and the cell defining `forms` has no execution
# count -- re-run the notebook before relying on this output.
forms

['', 'u', 'owi', '', 'em', 'ie', 'ie', 'y', 'ów', 'om', 'y', 'ami', 'ach', 'y']

In [22]:
# Pair every form label with its ending and store the result as a two-column TSV.
# zip() stops at the shorter sequence, so a length mismatch would silently write
# a truncated paradigm; fail loudly instead.
if len(noun_labels) != len(forms):
    raise ValueError(
        f"{len(noun_labels)} form labels but {len(forms)} endings; "
        "refusing to write a partial paradigm"
    )
# The endings contain non-ASCII characters (e.g. "ów"), so the encoding is set
# explicitly instead of depending on the platform default.
with open(get_abs_path("data/grammar/noun_mi_n.tsv"), "w", encoding="utf-8") as f:
    for label, ending in zip(noun_labels, forms):
        f.write(f"{label}\t{ending}\n")