Skip to content

Commit

Permalink
Now we have an issue with a missing parenthesis
Browse files: browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 6, 2020
1 parent 9adf218 commit cc51728
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions pie_extended/models/fro/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,16 @@ class MemorizingTokenizer(SourceMemorizingTokenizer):
re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1)
re_word_tokenizer = re.compile(r"[\s]+")
_sentence_boundaries = re.compile(
r"(?<!" + _RomanNumber + r"\.)(?<=" + _Dots_except_apostrophe + r"+)(\B)(?!\." + _RomanNumber + ")"
r"(([" + _Dots_except_apostrophe + r"]+)\s*)?+"
)

def __init__(self):
self.tokens = []

@classmethod
def _sentence_tokenizer(cls, string: str) -> List[str]:
string = cls._sentence_boundaries.sub(r"\g<1><SPLIT>", string)
string = cls._sentence_boundaries.sub(r"\g<><SPLIT>", string)
string = string.replace("_DOT_", ".")
print(string)
return string.split("<SPLIT>")

Expand All @@ -42,20 +43,30 @@ def word_tokenizer(self, data):

def sentence_tokenizer(self, data):
sentences = list()
for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data):
data = self.normalizer(data)
for sent in self._sentence_tokenizer(data):
sent = sent.strip()
sentences.append(sent)
if sent:
sentences.append(sent)
print(sentences)
yield from sentences

def replacer(self, inp: str):
inp = self.re_add_space_after_apostrophe.sub("", inp)
return inp

roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.")

def normalizer(self, data: str):
data = self.re_add_space_after_apostrophe.sub(
"\g<2> ",
self.re_add_space_around_punct.sub(" \g<2> ", data)
r"\g<2> ",
self.re_add_space_around_punct.sub(
r" \g<2> ",
self.roman_number_dot.sub(
r"_DOT_\g<1>_DOT_",
data
)
)
)
return data

Expand Down

0 comments on commit cc51728

Please sign in to comment.