In [2]:
import pynini
import string

#### String Tagging

Imagine that we have a collection of texts and we wish to place XML-style tags around any mention of various types of fine cheese.

Collection of strings: ["Boursin", "Camembert", "Cheddar", "Edam", "Gruyere",
                       "Ilchester", "Jarlsberg", "Red Leicester", "Stilton"]

Input : "Do you have Camembert or Edam?"

Output : `"Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?"`

In [4]:
# Alphabet for the rewrite rule: character codes 1-255. Codes 91-93
# ("[", "\" and "]") are supplied in backslash-escaped form instead of via
# chr() -- presumably because pynini treats brackets specially when it
# compiles strings (TODO confirm against the pynini docs).
chars = (
    [chr(code) for code in range(1, 91)]
    + [r"\[", r"\\", r"\]"]
    + [chr(code) for code in range(94, 256)]
)
sigma_star = pynini.union(*chars).closure()

# The cheese names to be tagged, compiled into a single FST.
cheeses = (
    "Boursin", "Camembert", "Cheddar", "Edam", "Gruyere",
    "Ilchester", "Jarlsberg", "Red Leicester", "Stilton",
)
fst_target = pynini.string_map(cheeses)

# Each tag is inserted by mapping the empty string to the tag text, so the
# concatenation below reads: opening tag, a cheese name, closing tag.
ltag = pynini.transducer("", "<cheese>")
rtag = pynini.transducer("", "</cheese>")
substitution = ltag + fst_target + rtag

# Both context arguments are empty strings, i.e. no left/right context is
# required for the substitution to apply.
rewrite = pynini.cdrewrite(substitution, "", "", sigma_star)

# Apply the rule to a sample sentence and read the result back as a string.
input_string = "Do you have Cheddar or Edam?"
tagged_lattice = pynini.compose(input_string, rewrite)
output = tagged_lattice.stringify()

In [5]:
# Show the string obtained by composing input_string with the tagging rewrite.
print(output)

Do you have <cheese>Cheddar</cheese> or <cheese>Edam</cheese>?


#### Plural Phase

In [8]:
# Irregular plurals are listed explicitly.
feet_rule = pynini.transducer("feet", "foot")
pence_rule = pynini.transducer("pence", "penny")

# Any sequence ending in "ches" drops the "es". The trailing -1 is a weight
# that gives this analysis priority over the generic "s" rule when both match.
ches_rule = sigma_star + pynini.transducer("ches", "ch", -1)

# Any sequence ending in "s" drops that final "s".
plain_s_rule = sigma_star + pynini.transducer("s", "")

singular_map = pynini.union(feet_rule, pence_rule, ches_rule, plain_s_rule)

# Right context: punctuation, a space, or the "[EOS]" symbol.
rc = pynini.union(".", ",", "!", ";", "?", " ", "[EOS]")

# The rewrite applies only when preceded by the literal " 1 " and followed by rc.
singularize = pynini.cdrewrite(singular_map, " 1 ", rc, sigma_star)
singularize.optimize(compute_props=True)

# NOTE: further post-processing was tried and is left disabled here:
# pynini.epsnormalize(singularize, eps_norm_output=True),
# pynini.disambiguate(singularize), and
# pynini.determinize(singularize, det_type="nonfunctional") followed by
# .minimize(allow_nondet=False).

# The output locations below are machine-specific; change them to a directory
# that exists on your system before running this cell.
singularize.draw('/home/rhythm/Data/office dell data/python_codes/pynini/ex2/singularize.dot')
singularize.write('/home/rhythm/Data/office dell data/python_codes/pynini/ex2/singularize.fst')


def _singularize(string):
    """Apply the module-level ``singularize`` rewrite to ``string``.

    Leading and trailing whitespace is stripped from ``string`` first. The
    stripped text is composed with ``singularize``, the shortest path through
    the result is taken, and that path is returned as a plain string.

    Note: the parameter name shadows the imported ``string`` module inside
    this function; the module is not used here.
    """
    trimmed = string.strip()
    lattice = pynini.compose(trimmed, singularize)
    best_path = pynini.shortestpath(lattice)
    return best_path.stringify()

In [14]:
# Example: "degrees" follows " 1 " and precedes ".", so the rewrite applies to it.
print(_singularize("The current temperature in New York is 1 degrees."))

The current temperature in New York is 1 degree.
