Skip to content

Commit

Permalink
It's ALIVE !
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 6, 2020
1 parent cc51728 commit e88eeaf
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions pie_extended/models/fro/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,21 @@ class MemorizingTokenizer(SourceMemorizingTokenizer):
# Sentence-level pattern, compiled with the VERSION1 flag: a run of underscores
# or characters that are neither whitespace nor word characters, optionally
# followed by a run of whitespace, underscores or non-word characters.
# NOTE(review): the `||` set-union syntax and VERSION1 presumably come from the
# third-party `regex` module imported under the name `re` -- confirm.
re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1)
# Word-level pattern: one or more whitespace characters.
re_word_tokenizer = re.compile(r"[\s]+")
# Sentence-boundary pattern: runs of the characters held in
# _Dots_except_apostrophe (interpolated into a character class), each run
# optionally followed by whitespace.
# NOTE(review): two pattern fragments follow one another below; this looks like
# the pre- and post-change line of a diff shown together. As written, the
# adjacent string literals concatenate into one pattern -- confirm which
# fragment is actually intended.
_sentence_boundaries = re.compile(
r"(([" + _Dots_except_apostrophe + r"]+)\s*)?+"
r"([" + _Dots_except_apostrophe + r"]+\s*)+"
)

def __init__(self):
self.tokens = []

@staticmethod
def _better_replacer(match):
start, end = match.span()
return match.string[start:end] + "<SPLIT>"

@classmethod
def _sentence_tokenizer(cls, string: str) -> List[str]:
    """Split *string* into chunks at every sentence-boundary match.

    Each match of ``cls._sentence_boundaries`` is kept in place and followed
    by a ``<SPLIT>`` marker (through ``cls._better_replacer``), ``_DOT_``
    placeholders are turned back into ``.``, and the text is then cut on the
    markers.

    :param string: Text to split.
    :return: The pieces found between markers, in order. Pieces are returned
        as-is, so they may be empty or carry surrounding whitespace.
    """
    # One substitution pass using the callable replacer. The former template
    # form (a \g<> reference with an empty group name) is not a valid group
    # reference, so it has been dropped instead of running before this call.
    marked = cls._sentence_boundaries.sub(cls._better_replacer, string)
    # Restore literal dots that were protected as placeholders; this happens
    # after boundary detection so they never trigger a split.
    marked = marked.replace("_DOT_", ".")
    # The debugging print of the intermediate string was removed: a library
    # tokenizer should not write to stdout.
    return marked.split("<SPLIT>")

def word_tokenizer(self, data):
Expand All @@ -48,7 +52,6 @@ def sentence_tokenizer(self, data):
sent = sent.strip()
if sent:
sentences.append(sent)
print(sentences)
yield from sentences

def replacer(self, inp: str):
Expand Down

0 comments on commit e88eeaf

Please sign in to comment.