Skip to content

Commit

Permalink
Fro is tested
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 21, 2020
1 parent 0943043 commit 1166fcf
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 18 deletions.
31 changes: 17 additions & 14 deletions pie_extended/models/fro/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@


class FroMemorizingTokenizer(MemorizingTokenizer):
re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)")
APOSTROPHES = "'’ʼ"
re_elision_apostrophe = re.compile(r"(\w+)([" + APOSTROPHES + r"])(\w+)")
re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")
re_add_space_around_apostrophe_that_are_quotes = re.compile(
r"("
r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|"
Expand Down Expand Up @@ -42,10 +44,11 @@ def _sentence_tokenizer_merge_matches(match):
start, end = match.span()
return match.string[start:end] + "<SPLIT>"

@classmethod
def _real_sentence_tokenizer(cls, string: str) -> List[str]:
string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string)
def _real_sentence_tokenizer(self, string: str) -> List[str]:
string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)
string = string.replace("_DOT_", ".")
for index_apo, apo in enumerate(self.APOSTROPHES):
string = string.replace("ApOsTrOpHe"+str(index_apo), apo+" ")
return string.split("<SPLIT>")

def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
Expand All @@ -63,19 +66,19 @@ def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[s
sentences.append(self.word_tokenizer(sent))
yield from sentences

def apostrophe_replace(self, regex_match) -> str:
    """Swap the apostrophe captured by a match for an indexed placeholder.

    The apostrophe is replaced by the literal marker ``"ApOsTrOpHe"``
    followed by the position of that apostrophe inside
    ``self.APOSTROPHES``, so the exact character can be recovered from
    the index later on.

    :param regex_match: Match object exposing three groups: the text
        before the apostrophe (1), the apostrophe itself (2) and the
        text after it (3).
    :return: Group 1, the placeholder and group 3 concatenated.
    :raises ValueError: If group 2 is not a character of
        ``self.APOSTROPHES`` (propagated from ``str.index``).
    """
    before, apostrophe, after = regex_match.group(1, 2, 3)
    # The numeric suffix records *which* apostrophe variant was seen.
    placeholder = "ApOsTrOpHe" + str(self.APOSTROPHES.index(apostrophe))
    return "".join((before, placeholder, after))

def normalizer(self, data: str) -> str:
data = self.re_remove_ending_apostrophe.sub(
r"\g<1> ",
self.re_add_space_around_apostrophe_that_are_quotes.sub(
r" \g<2> ",
self.re_add_space_around_punct.sub(
data = self.re_add_space_around_punct.sub(
r" \g<2> ",
self.roman_number_dot.sub(
r"_DOT_\g<1>_DOT_",
data
self.re_elision_apostrophe.sub(
self.apostrophe_replace,
self.roman_number_dot.sub(
r"_DOT_\g<1>_DOT_",
data
)
)
)
)
)
return data

Expand Down
9 changes: 5 additions & 4 deletions tests/test_models/test_fro.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ def test_elision_apostrophe(self):
self.assertEqual(out[0]["treated"], "q")

def test_elision_apostrophe_and_quote(self):
string = "a q'il meurt 'dit il'"
treated = ["a q il meurt dit il"]
string = "'q'il meurt 'dit il'"
treated = ["q il meurt dit il"]
tagger, it, pro = make_controller(treated)
out = tagger.tag_str(string, it, pro)
self.assertEqual(out[0]["form"], "a")
self.assertEqual(out[0]["treated"], "a")
self.assertEqual(out[0]["form"], "'")
self.assertEqual(out[0]["treated"], "'")
self.assertEqual(out[1]["form"], "q'")
self.assertEqual(out[1]["treated"], "q")
self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept")
# Leading and trailing apostrophes are not reinserted for some reason.

0 comments on commit 1166fcf

Please sign in to comment.