Skip to content

Commit

Permalink
Cleaner
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 6, 2020
1 parent e88eeaf commit 20d53e8
Showing 1 changed file with 3 additions and 6 deletions.
9 changes: 3 additions & 6 deletions pie_extended/models/fro/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,17 @@
# Tokenizer built on SourceMemorizingTokenizer. The compiled patterns below are
# read through `self.` by the methods of this class (e.g. `replacer`,
# `normalizer`).
# NOTE(review): `re.VERSION1` is not a flag of the standard-library `re`
# module; presumably `re` here is the third-party `regex` package imported
# under that name — confirm against the file's imports.
class MemorizingTokenizer(SourceMemorizingTokenizer):
# Three groups: optional whitespace, a run of one or more characters that are
# neither word characters, whitespace, nor one of the three apostrophe
# variants, then optional whitespace.
re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ]+)(\s*)")
# Three groups: optional whitespace, exactly one apostrophe variant, optional
# whitespace. Group 2 (the apostrophe) is what `normalizer` re-emits via \g<2>.
re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)")
# One or more consecutive whitespace characters, captured as a single group.
re_normalize_space = re.compile(r"(\s+)")
# Compiled with VERSION1; the pattern relies on nested character sets and the
# `||` set operator.
# NOTE(review): exact matching semantics depend on the regex engine's
# VERSION1 set syntax — verify before modifying.
re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1)
# One or more whitespace characters.
re_word_tokenizer = re.compile(r"[\s]+")
# One or more repetitions of: a run of characters from
# `_Dots_except_apostrophe` followed by optional whitespace.
# `_Dots_except_apostrophe` is defined outside the visible excerpt.
_sentence_boundaries = re.compile(
r"([" + _Dots_except_apostrophe + r"]+\s*)+"
)
# A literal dot, the `_RomanNumber` sub-pattern captured as group 1, then a
# literal dot. `_RomanNumber` is defined outside the visible excerpt.
roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.")

# Create the tokenizer with an empty `tokens` list.
# NOTE(review): no call to the parent class initializer is visible in this
# excerpt — confirm that skipping it is intentional.
def __init__(self):
self.tokens = []

# Takes a regex match object and returns the full matched text followed by the
# literal marker "<SPLIT>". Presumably used as a replacement callback for a
# `sub` call on the sentence tokenizer pattern — the call site is not visible.
# NOTE(review): two `def` headers appear back to back below. Presumably the
# first (`_better_replacer`) is the pre-rename signature shown by the commit
# diff and only `_sentence_tokenizer_merge_matches` is the current name —
# confirm against the actual file.
@staticmethod
def _better_replacer(match):
def _sentence_tokenizer_merge_matches(match):
""" Best way we found to deal with repeating groups"""
# span() with no argument gives the bounds of the whole match, so the slice
# below is the entire matched substring rather than a single capture group.
start, end = match.span()
return match.string[start:end] + "<SPLIT>"

Expand All @@ -58,8 +57,6 @@ def replacer(self, inp: str):
inp = self.re_add_space_after_apostrophe.sub("", inp)
return inp

roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.")

def normalizer(self, data: str):
data = self.re_add_space_after_apostrophe.sub(
r"\g<2> ",
Expand Down

0 comments on commit 20d53e8

Please sign in to comment.