In [16]:
!pip install pynini



In [31]:
# cldr2fst.py
# Requires: pynini (which depends on OpenFst)
#   pip install pynini==2.1.5
#   (or a version compatible with your platform)
#
# Usage example (see bottom of file):
#   python cldr2fst.py

from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Dict, List, Tuple

import pynini
from pynini import *
from pynini.lib import pynutil
from pynini.lib import utf8


# -------------------------------
# Utilities
# -------------------------------

_U_HEX = re.compile(r"\\u([0-9A-Fa-f]{4})")
_U_HEX_LONG = re.compile(r"\\U([0-9A-Fa-f]{6,8})")

def _decode_escapes(s: str) -> str:
    """Decodes CLDR-style escapes like \\u0259 and \\U0001F600."""
    def rpl4(m):
        return chr(int(m.group(1), 16))
    def rpl8(m):
        return chr(int(m.group(1), 16))
    s = _U_HEX.sub(rpl4, s)
    s = _U_HEX_LONG.sub(rpl8, s)
    # Keep other backslashes as literals (CLDR has many constructs; we stay conservative)
    return s

def _expand_char_class(cls: str) -> List[str]:
    """
    Expand a simple bracket class like [abc] or [a-z].
    This is intentionally minimal; it does not handle nested classes, properties, or set ops.
    """
    out = []
    i = 0
    while i < len(cls):
        if i + 2 < len(cls) and cls[i+1] == "-":
            start = ord(cls[i])
            end = ord(cls[i+2])
            for cp in range(start, end + 1):
                out.append(chr(cp))
            i += 3
        else:
            out.append(cls[i])
            i += 1
    return out

def _tokenize_pattern(pat: str) -> List[str]:
    """
    Tokenize a very small subset of CLDR pattern syntax:
    - Literal characters
    - Character classes [..]
    Returns a list of alternatives (strings) if the pattern is a single char-class;
    otherwise returns [pat] (treated as a literal string).
    """
    pat = pat.strip()
    if len(pat) >= 2 and pat[0] == "[" and pat[-1] == "]":
        inner = _decode_escapes(pat[1:-1])
        alts = _expand_char_class(inner)
        return alts
    # Otherwise we treat it as a literal sequence (already escape-decoded)
    return [_decode_escapes(pat)]

def _string_map(pairs: List[Tuple[str, str]]) -> Fst:
    """
    Build a union of literal string transductions, determinize & minimize.
    """
    # Pynini.string_map can take a dict or list of pairs
    t = pynini.string_map(pairs)
    t.optimize()
    return t


# -------------------------------
# Parsing CLDR rules (subset)
# -------------------------------

_RULE_RE = re.compile(r"""
    ^\s*
    (?P<lhs>.+?)
    \s*
    (?P<op><>|>|<)
    \s*
    (?P<rhs>.+?)
    ;
    \s*$
""", re.VERBOSE)

@dataclass
class Rule:
    lhs: str
    rhs: str
    op: str  # ">", "<", "<>"

def parse_cldr_rules_simple(text: str) -> List[Rule]:
    """
    Parse a subset of CLDR rule lines:
      LHS > RHS ;
      LHS < RHS ;
      LHS <> RHS ;
    Strips comments (# ...) and blank lines.
    Ignores directives (:: ... ;)
    """
    rules: List[Rule] = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("::"):
            # ignore directives in this simple converter
            continue
        m = _RULE_RE.match(line)
        if not m:
            # Not supported yet; skip quietly so you can still test quickly.
            continue
        lhs = m.group("lhs").strip()
        rhs = m.group("rhs").strip()
        op = m.group("op")
        rules.append(Rule(lhs=lhs, rhs=rhs, op=op))
    return rules


def build_transducer_from_rules(rules: List[Rule]) -> Fst:
    """
    Build a transducer from simple (context-free) CLDR rules.
    Uses cdrewrite with utf8.VALID_UTF8.closure() so non-matching chars pass through.
    """
    pairs: List[Tuple[str, str]] = []

    for r in rules:
        lhs_alts = _tokenize_pattern(r.lhs)
        rhs_alts = _tokenize_pattern(r.rhs)

        def add_pairs(A: List[str], B: List[str]):
            if len(A) == len(B):
                for x, y in zip(A, B):
                    pairs.append((_decode_escapes(x), _decode_escapes(y)))
            else:
                for x in A:
                    for y in B:
                        pairs.append((_decode_escapes(x), _decode_escapes(y)))

        if r.op == ">":
            add_pairs(lhs_alts, rhs_alts)
        elif r.op == "<":
            add_pairs(rhs_alts, lhs_alts)
        elif r.op == "<>":
            add_pairs(lhs_alts, rhs_alts)
            add_pairs(rhs_alts, lhs_alts)

    sigma_star = utf8.VALID_UTF8_CHAR.closure()

    if not pairs:
        # Identity FST if no rules
        return pynini.cdrewrite(pynini.cross("", ""), "", "", sigma_star)

    # Sequentially compose cdrewrite rules
    t = None
    for src, tgt in pairs:
        rule = pynini.cdrewrite(
            pynini.cross(src, tgt),
            "", "", sigma_star
        )
        t = rule if t is None else (t @ rule)

    t.optimize()
    return t


def prefix_language_token(lang_token: str) -> Fst:
    """
    Create an acceptor for a language prefix token like '<eng>'.
    We allow it as literal at the beginning and then delete it from output:
      '<eng>' x ...  -> ... (so the prefix doesn't appear in the output)
    """
    # Literal token
    token_acceptor = pynini.accep(lang_token)
    # We want to delete it from output: cross(lang_token, "").
    return pynini.cross(lang_token, "")


def build_multilingual_transducer(
    lang_to_rules_text: Dict[str, str],
    token_fmt: str = "<{lang}>",
) -> Fst:
    """
    For each language:
      - parse rules
      - build transducer
      - prepend a required language token (deleted on output)
    Then union all.
    """
    unified = None
    for lang, text in lang_to_rules_text.items():
        rules = parse_cldr_rules_simple(text)
        t = build_transducer_from_rules(rules)
        pref = prefix_language_token(token_fmt.format(lang=lang))
        lang_t = pref + t  # concatenation; prefix must come first
        lang_t.optimize()
        unified = lang_t if unified is None else (unified | lang_t)

    if unified is None:
        # No rules: accept anything and echo it (after removing an imaginary token)
        unified = pynini.transducer("", "")
    unified.optimize()
    # Determinize/minimize for speed
    unified = pynini.determinize(unified).minimize()
    return unified


def save_fst(fst: Fst, path: str) -> None:
    fst.write(path)


# -------------------------------
# Quick demo
# -------------------------------

def demo():
    # Two tiny “CLDR-like” rule sets aiming toward IPA-ish output just for demonstration.
    # (These are *not* authoritative IPA mappings—replace with your real CLDR rules.)
    cldr_en = r"""
        # English-ish toy rules
        th > θ ;
        sh > ʃ ;
        ch > t͡ʃ ;
        ng > ŋ ;
        [aeiou] > a ;   # silly vowel collapse to 'a'
    """

    cldr_es = r"""
        # Spanish-ish toy rules
        ll > ʎ ;
        ñ > ɲ ;
        qu > k ;
        c > k ;
        z > s ;
        [aeiou] > a ;
    """

    lang_rules = {
        "eng": cldr_en,
        "spa": cldr_es,
    }

    fst = build_multilingual_transducer(lang_rules, token_fmt="<{lang}>")
    save_fst(fst, "multilang.fst")

    # Tiny run test: compose an input with the FST and output the best path
    # Example inputs must be prefixed with the language token.
    test_inputs = [
        "<eng>thing",
        "<eng>mashing",
        "<spa>llama",
        "<spa>quiza",
    ]

    def apply(input_str: str) -> str:
        lattice = pynini.compose(input_str, fst)
        if lattice.start() == pynini.NO_STATE_ID:
            return "<no-path>"
        try:
            return pynini.shortestpath(lattice, 1).string()
        except Exception:
            return "<no-output>"

    for s in test_inputs:
        print(s, "->", apply(s))

    print("Wrote combined FST to multilang.fst")

In [32]:
demo()

<eng>thing -> θiŋ
<eng>mashing -> maʃiŋ
<spa>llama -> ʎama
<spa>quiza -> kasa
Wrote combined FST to multilang.fst
