From 1166fcf74847675e88c20361e4903c4ee7233c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:52:29 +0100 Subject: [PATCH] Fro is tested --- pie_extended/models/fro/tokenizer.py | 31 +++++++++++++++------------- tests/test_models/test_fro.py | 9 ++++---- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index 3993587..bac16b6 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -14,7 +14,9 @@ class FroMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") + APOSTROPHES = "'’ʼ" + re_elision_apostrophe = re.compile(r"(\w+)([" + APOSTROPHES + r"])(\w+)") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"(" r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|" @@ -42,10 +44,11 @@ def _sentence_tokenizer_merge_matches(match): start, end = match.span() return match.string[start:end] + "" - @classmethod - def _real_sentence_tokenizer(cls, string: str) -> List[str]: - string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) + def _real_sentence_tokenizer(self, string: str) -> List[str]: + string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string) string = string.replace("_DOT_", ".") + for index_apo, apo in enumerate(self.APOSTROPHES): + string = string.replace("ApOsTrOpHe"+str(index_apo), apo+" ") return string.split("") def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: @@ -63,19 +66,19 @@ def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[s sentences.append(self.word_tokenizer(sent)) yield from sentences + def apostrophe_replace(self, regex_match) -> str: + return regex_match.group(1) + "ApOsTrOpHe"+ str(self.APOSTROPHES.index(regex_match.group(2))) + regex_match.group(3) + def normalizer(self, data: str) -> str: - data = self.re_remove_ending_apostrophe.sub( - r"\g<1> ", - self.re_add_space_around_apostrophe_that_are_quotes.sub( - r" \g<2> ", - self.re_add_space_around_punct.sub( + data = self.re_add_space_around_punct.sub( r" \g<2> ", - self.roman_number_dot.sub( - r"_DOT_\g<1>_DOT_", - data + self.re_elision_apostrophe.sub( + self.apostrophe_replace, + self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) ) - ) - ) ) return data diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py index accce3d..9f80f6d 100644 --- a/tests/test_models/test_fro.py +++ b/tests/test_models/test_fro.py @@ -26,12 +26,13 @@ def test_elision_apostrophe(self): self.assertEqual(out[0]["treated"], "q") def test_elision_apostrophe_and_quote(self): - string = "a q'il meurt 'dit il'" - treated = ["a q il meurt dit il"] + string = "'q'il meurt 'dit il'" + treated = ["q il meurt dit il"] tagger, it, pro = make_controller(treated) out = tagger.tag_str(string, it, pro) - self.assertEqual(out[0]["form"], "a") - self.assertEqual(out[0]["treated"], "a") + self.assertEqual(out[0]["form"], "'") + self.assertEqual(out[0]["treated"], "'") self.assertEqual(out[1]["form"], "q'") self.assertEqual(out[1]["treated"], "q") + self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept") # Ending and starting apostrophe are not reinserted for some reason.