In [1]:
# Colab only: install condacolab and let it set up conda inside the runtime.
# condacolab.install() ends by restarting the kernel (see the log below), so this
# cell has to run on its own before anything else in the notebook.
if 'google.colab' in str(get_ipython()):
  !pip install -q condacolab
  import condacolab
  condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:30
🔁 Restarting kernel...


In [1]:
%%capture
if 'google.colab' in str(get_ipython()):
  # Install the pinned pynini build (2.1.3) from conda-forge; the cell's output is captured.
  ! conda install -c conda-forge pynini=2.1.3

In [2]:
import string
import pynini
from pynini import Far
from pynini.export import export
from pynini.examples import plurals
from pynini.lib import byte, pynutil, utf8
# Character-class acceptors shared by every grammar cell below.
# One UTF-8 character / one digit, taken from pynini's bundled libraries.
NEMO_CHAR = utf8.VALID_UTF8_CHAR
NEMO_DIGIT = byte.DIGIT
# ASCII letters only: a-z, A-Z, their union, and letters plus digits.
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
# Zero or more characters; passed as the sigma-star argument of cdrewrite.
NEMO_SIGMA = pynini.closure(NEMO_CHAR)
# The single-space separator used between words and between digit groups.
NEMO_SPACE = " "
# Space, tab, newline, carriage return and non-breaking space (U+00A0).
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
# Deletes any run (possibly empty) of whitespace characters.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
# Inserts exactly one space on the output side.
insert_space = pynutil.insert(" ")


In [3]:
from pathlib import Path
import os

In [4]:
class GraphFst:
    """
    Base class for all grammar fsts.

    On construction the class looks for a pre-compiled FAR archive at
    ``<base dir>/grammars/<kind>/<name>.far`` and, if that file exists, loads
    its FST; otherwise ``fst`` stays ``None`` until it is assigned.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        # Keep the caller-supplied kind string (not the builtin ``str`` type).
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # ``__file__`` may be undefined when the class is defined in an interactive
        # session (e.g. a notebook cell); fall back to the working directory then.
        try:
            base_dir = os.path.dirname(__file__)
        except NameError:
            base_dir = os.getcwd()

        self.far_path = Path(base_dir + '/grammars/' + kind + '/' + name + '.far')
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded, i.e. the file at ``far_path`` exists
        """
        return self.far_path.exists()

    @property
    def fst(self) -> 'pynini.FstLike':
        """The grammar FST, or None if none was loaded or assigned yet."""
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> 'pynini.FstLike':
        """
        Wraps class name around to given fst, producing ``<name> { ... }``

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Deletes class name wrap around output of given fst

        Whitespace around the braces is deleted as well, and any non-breaking
        space (U+00A0) left in the result is rewritten to a plain space.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)


In [5]:
!mkdir -p data/numbers/

In [6]:
%%writefile data/numbers/zero.tsv
noll	0

Writing data/numbers/zero.tsv


In [7]:
%%writefile data/numbers/ties.tsv
tjugo	2
trettio	3
fyrtio	4
femtio	5
sextio	6
sjuttio	7
åttio	8
nittio	9

Writing data/numbers/ties.tsv


In [8]:
%%writefile data/numbers/digit.tsv
ett	1
två	2
tre	3
fyra	4
fem	5
sex	6
sju	7
åtta	8
nio	9

Writing data/numbers/digit.tsv


In [9]:
%%writefile data/numbers/teen.tsv
tio	10
elva	11
tolv	12
tretton	13
fjorton	14
femton	15
sexton	16
sjutton	17
arton	18
nitton	19

Writing data/numbers/teen.tsv


In [10]:
def get_abs_path(rel_path, base_dir="/content"):
    """
    Get absolute path by prefixing ``rel_path`` with ``base_dir``

    A message is printed if the resulting path does not exist; the path is
    returned either way.

    Args:
        rel_path: path relative to ``base_dir``
        base_dir: directory the relative path is resolved against; the default
            "/content" is the directory this notebook writes its data files
            under when run on Colab. Pass another directory to run elsewhere.

    Returns absolute path
    """
    abs_path = base_dir + os.sep + rel_path

    if not os.path.exists(abs_path):
        print(f'{abs_path} does not exist')
    return abs_path


In [11]:
# Each TSV maps a Swedish number word to its digits; inverting the loaded
# transducer gives the digits -> word direction used by the grammar below.
zero, digit, teen, ties = (
    pynini.invert(pynini.string_file(get_abs_path(_table_path)))
    for _table_path in (
        "data/numbers/zero.tsv",
        "data/numbers/digit.tsv",
        "data/numbers/teen.tsv",
        "data/numbers/ties.tsv",
    )
)


In [12]:
def filter_punctuation(fst):
    """
    Restrict ``fst`` to well-formed digit strings, optionally grouped in threes.

    Two input shapes are accepted and then composed with ``fst``:
      * one or more digits with no separator, passed through unchanged;
      * one to three leading digits followed by one or more blocks of exactly
        three digits, each block preceded by a single space that is deleted.

    Args:
        fst: grammar that consumes the separator-free digit string

    Returns:
        the composition of the accepted digit strings with ``fst``
    """
    drop_separator = pynutil.delete(NEMO_SPACE)
    digit_triple = NEMO_DIGIT ** 3  # every block after the first
    leading_block = pynini.closure(NEMO_DIGIT, 1, 3)  # first block may be shorter

    # Plain run of digits (used for page numbers, thousand series)
    ungrouped = pynini.closure(NEMO_DIGIT, 1)
    grouped = (
        leading_block
        + drop_separator
        + pynini.closure(digit_triple + drop_separator)
        + digit_triple
    )

    return (ungrouped | grouped) @ fst


In [13]:
# Grammar mode read by the cells below: True builds one reading per number,
# False additionally unions in alternative spoken variants.
deterministic = True

In [14]:
# Single digits 1-9 as loaded from digit.tsv.
graph_digit = digit
# The two spoken forms offered for "1" when alternatives are allowed.
both_ones = pynini.union(pynini.cross("1", "en"), pynini.cross("1", "ett"))
# Every digit reading except the one for "1".
digits_no_one = (NEMO_DIGIT - "1") @ graph_digit
# Reading used for the last digit of a number: the plain table when
# deterministic, otherwise "1" may be read as either "en" or "ett".
final_digit = digit if deterministic else (digits_no_one | both_ones)


In [15]:
# Two-digit numbers: 10-19 come straight from the teen table, 20-99 are a
# tens word followed by either a dropped "0" or a unit digit.
graph_tens = teen
if deterministic:
    _unit_after_ties = graph_digit
else:
    # Also allow the unit to be written as a separate word.
    _unit_after_ties = graph_digit | pynutil.insert(' ') + graph_digit
graph_tens |= ties + (pynutil.delete('0') | _unit_after_ties)

final_tens = graph_tens
if not deterministic:
    # Same shape again, but ending in the number-final digit readings.
    final_tens |= ties + (pynutil.delete('0') | (final_digit | pynutil.insert(' ') + final_digit))


In [20]:
# Single digits 1-9 as loaded from digit.tsv.
graph_digit = digit
# Every digit reading except the one for "1".
digits_no_one = (NEMO_DIGIT - "1") @ graph_digit
# The two spoken forms offered for "1" when alternatives are allowed.
both_ones = (pynini.cross("1", "en") | pynini.cross("1", "ett"))
if deterministic:
    final_digit = digit
else:
    final_digit = digits_no_one | both_ones

# Any double digit
graph_tens = teen
if deterministic:
    graph_tens |= ties + (pynutil.delete('0') | graph_digit)
    final_tens = graph_tens
else:
    graph_tens |= ties + (pynutil.delete('0') | (graph_digit | pynutil.insert(' ') + graph_digit))
    # Bind final_tens before extending it, so this cell neither fails on a
    # fresh kernel nor silently builds on a final_tens left over from an earlier cell.
    final_tens = graph_tens
    final_tens |= ties + (pynutil.delete('0') | (final_digit | pynutil.insert(' ') + final_digit))

# Hundreds digit: 2-9 are read out and followed by "hundra"; a lone "1" is
# read as bare "hundra".
hundreds = (digits_no_one + pynutil.insert("hundra")) | pynini.cross("1", "hundra")
if not deterministic:
    # Extra spoken variants for 1xx, then any digit written as a separate word.
    for _one_hundred_variant in ("etthundra", "ett hundra"):
        hundreds |= pynini.cross("1", _one_hundred_variant)
    hundreds |= digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra")

# Optimized two-digit graph kept under its own name.
# NOTE(review): pynini's Fst.optimize() appears to work in place and return the
# same object, so self_tens would alias graph_tens — confirm before reordering.
self_tens = graph_tens.optimize()

# One- or two-digit strings with a non-zero value: a single digit, a two-digit
# number, or a digit behind a leading "0" that is dropped.
graph_two_digit_non_zero = pynini.union(
    graph_digit, graph_tens, (pynutil.delete("0") + graph_digit)
)
if not deterministic:
    # Variant in which the leading "0" becomes a space instead of being dropped.
    graph_two_digit_non_zero |= pynini.union(
        graph_digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + graph_digit)
    )

self_two_digit_non_zero = graph_two_digit_non_zero.optimize()

# Same as above, but single digits use the number-final digit readings.
graph_final_two_digit_non_zero = pynini.union(
    final_digit, graph_tens, (pynutil.delete("0") + final_digit)
)
if not deterministic:
    graph_final_two_digit_non_zero |= pynini.union(
        final_digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + final_digit)
    )

self_final_two_digit_non_zero = graph_final_two_digit_non_zero.optimize()

# Three digit strings
# A hundreds reading followed by: "00" (dropped), a two-digit number, or a
# dropped "0" plus a final digit.
graph_hundreds = hundreds + pynini.union(
    pynutil.delete("00"), graph_tens, (pynutil.delete("0") + final_digit)
)
if not deterministic:
    # Variants that put a space between the hundreds word and what follows.
    graph_hundreds |= hundreds + pynini.union(
        pynutil.delete("00"), (graph_tens | pynutil.insert(NEMO_SPACE) + graph_tens), (pynini.cross("0", NEMO_SPACE) + final_digit)
    )

self_hundreds = graph_hundreds.optimize()

# Three-digit group that is either a hundreds reading or "0" + two-digit number.
graph_hundreds_component = pynini.union(graph_hundreds, pynutil.delete("0") + graph_tens)

# As above, plus "00" + single digit: every three-digit group except "000".
graph_hundreds_component_at_least_one_non_zero_digit = graph_hundreds_component | (
    pynutil.delete("00") + graph_digit
)
# Same, but the "00" + digit case excludes "1", so "001" is not accepted here;
# the scale-word graphs below handle "001" with their own singular forms.
graph_hundreds_component_at_least_one_non_zero_digit_no_one = graph_hundreds_component | (
    pynutil.delete("00") + digits_no_one
)


# The inserted word for "thousand", plus spoken/spaced variants when
# alternatives are allowed.
tusen = pynutil.insert("tusen")
if not deterministic:
    for _tusen_variant in (" tusen", "ettusen", " ettusen", "ett tusen", " ett tusen"):
        tusen |= pynutil.insert(_tusen_variant)

# What may follow the thousands word: a space and a non-zero three-digit
# group, or a trailing "000" that is dropped.
_after_thousands = (insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")
# Thousands group other than "001", read out and followed by the thousands word.
_counted_thousands = graph_hundreds_component_at_least_one_non_zero_digit_no_one + tusen + _after_thousands
# Thousands group "001" is read as bare "tusen".
_single_thousand = pynini.cross("001", "tusen") + _after_thousands

# Six-digit strings with at least one non-zero digit.
graph_thousands_component_at_least_one_non_zero_digit = pynini.union(
    pynutil.delete("000") + graph_hundreds_component_at_least_one_non_zero_digit,
    _counted_thousands,
    _single_thousand,
)

# Same, except that with an all-zero thousands group the last three digits
# must not be "001".
graph_thousands_component_at_least_one_non_zero_digit_no_one = pynini.union(
    pynutil.delete("000") + graph_hundreds_component_at_least_one_non_zero_digit_no_one,
    _counted_thousands,
    _single_thousand,
)

def _build_scale_graph(singular, alt_singular, multiplier, deterministic):
    """
    Build the reading of one three-digit group that carries a scale word.

    The group is read as:
      * "001"  -> the singular scale word (slightly favoured via a negative weight);
      * any group accepted by ``multiplier`` -> its reading, a space and the
        plural scale word (singular + "er");
      * "000"  -> nothing.
    A single space is always appended afterwards.

    Args:
        singular: singular scale word, e.g. "miljon"
        alt_singular: alternative spelling, e.g. "million"; only used when
            ``deterministic`` is False
        multiplier: graph reading the three-digit group in front of the plural word
        deterministic: when False, alternative spellings and "en"/"ett" prefixed
            singular forms are unioned in as well

    Returns:
        the scale graph for this three-digit group
    """
    scale = pynutil.add_weight(pynini.cross("001", singular), -0.001)
    if not deterministic:
        singular_variants = (
            alt_singular,
            f"en {singular}",
            f"ett {singular}",
            f"en {alt_singular}",
            f"ett {alt_singular}",
        )
        for variant in singular_variants:
            scale |= pynutil.add_weight(pynini.cross("001", variant), -0.001)
    scale |= multiplier + pynutil.insert(f" {singular}er")
    if not deterministic:
        scale |= multiplier + pynutil.insert(f" {alt_singular}er")
    # An all-zero group contributes no words.
    scale |= pynutil.delete("000")
    scale += insert_space
    return scale


# One graph per scale word, all built from the same recipe.
graph_million = _build_scale_graph(
    "miljon", "million", graph_hundreds_component_at_least_one_non_zero_digit_no_one, deterministic
)

graph_milliard = _build_scale_graph(
    "miljard", "milliard", graph_hundreds_component_at_least_one_non_zero_digit_no_one, deterministic
)

graph_billion = _build_scale_graph(
    "biljon", "billion", graph_hundreds_component_at_least_one_non_zero_digit_no_one, deterministic
)

graph_billiard = _build_scale_graph(
    "biljard", "billiard", graph_hundreds_component_at_least_one_non_zero_digit_no_one, deterministic
)

graph_trillion = _build_scale_graph(
    "triljon", "trillion", graph_hundreds_component_at_least_one_non_zero_digit_no_one, deterministic
)

graph_trilliard = _build_scale_graph(
    "triljard", "trilliard", graph_hundreds_component_at_least_one_non_zero_digit_no_one, deterministic
)

# Read a 24-digit string as six three-digit scale groups (largest scale first)
# followed by the last six digits, which may also be all zeros.
graph = (
    graph_trilliard
    + graph_trillion
    + graph_billiard
    + graph_billion
    + graph_milliard
    + graph_million
    + (graph_thousands_component_at_least_one_non_zero_digit | pynutil.delete("000000"))
)

# Accept a digit string that does not start with "0", left-pad it with zeros to
# exactly 24 digits, read it with `graph`, then tidy the output: strip
# whitespace at both ends and collapse runs of two or more whitespace
# characters between letters into a single space.
self_graph = (
    ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0))
    @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA)
    @ NEMO_DIGIT ** 24
    @ graph
    @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA)
    @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA)
    @ pynini.cdrewrite(
        pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 2), NEMO_SPACE), NEMO_ALPHA, NEMO_ALPHA, NEMO_SIGMA
    )
)
# "0" itself is read from the zero table.
self_graph |= zero

# Also accept input whose digits are grouped in threes separated by spaces.
self_graph = filter_punctuation(self_graph).optimize()

# An optional leading "-" is emitted as the field: negative: "true"
optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

# Put the reading in an integer field, after the optional negative field.
final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self_graph + pynutil.insert("\"")

# Wrap everything in the "cardinal { ... }" token.
final_graph = pynutil.insert("cardinal { ") + final_graph + pynutil.insert(" }")
self_fst = final_graph.optimize()



In [21]:
# Smoke check: classify a 24-digit number, the full width the grammar pads inputs to.
("111222333444555666777888" @ self_fst).string()

'cardinal { integer: "hundraelva triljarder tvåhundratjugotvå triljoner trehundratrettiotre biljarder fyrahundrafyrtiofyra biljoner femhundrafemtiofem miljarder sexhundrasextiosex miljoner sjuhundrasjuttiosjutusen åttahundraåttioåtta" }'

In [None]:
# Inverse direction: map a spoken form back to its three-digit string.
("hundraett" @ pynini.invert(graph_hundreds_component_at_least_one_non_zero_digit_no_one).optimize()).string()

'101'