
Commit

Working
PonteIneptique committed Feb 6, 2020
1 parent 20d53e8 commit 9a7ed93
Showing 2 changed files with 19 additions and 13 deletions.
pie_extended/models/fro/classes.py (29 changes: 18 additions & 11 deletions)
@@ -26,6 +26,11 @@ class MemorizingTokenizer(SourceMemorizingTokenizer):
     roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.")
 
     def __init__(self):
+        super(MemorizingTokenizer, self).__init__(
+            sentence_tokenizer=self._sentence_tokenizer,
+            word_tokenizer=self._word_tokenizer,
+            normalizer=self._normalizer
+        )
         self.tokens = []
 
     @staticmethod
@@ -35,29 +40,30 @@ def _sentence_tokenizer_merge_matches(match):
         return match.string[start:end] + "<SPLIT>"
 
     @classmethod
-    def _sentence_tokenizer(cls, string: str) -> List[str]:
-        string = cls._sentence_boundaries.sub(cls._better_replacer, string)
+    def _real_sentence_tokenizer(cls, string: str) -> List[str]:
+        string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string)
         string = string.replace("_DOT_", ".")
         return string.split("<SPLIT>")
 
-    def word_tokenizer(self, data):
+    @staticmethod
+    def _word_tokenizer(data):
         # HERE you need to do the tokenization yourself, with your own function
         return data.split()
 
-    def sentence_tokenizer(self, data):
+    def _sentence_tokenizer(self, data):
         sentences = list()
         data = self.normalizer(data)
-        for sent in self._sentence_tokenizer(data):
+        for sent in self._real_sentence_tokenizer(data):
             sent = sent.strip()
             if sent:
                 sentences.append(sent)
         yield from sentences
 
-    def replacer(self, inp: str):
-        inp = self.re_add_space_after_apostrophe.sub("", inp)
-        return inp
+    def _replacer(self, inp: str):
+        out = self.re_add_space_after_apostrophe.sub("", inp)
+        return out
 
-    def normalizer(self, data: str):
+    def _normalizer(self, data: str):
         data = self.re_add_space_after_apostrophe.sub(
             r"\g<2> ",
             self.re_add_space_around_punct.sub(
@@ -98,7 +104,8 @@ def format_line(self, token, tags, ignored=False):
         if token != out_token:
             raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))
 
-        overwriten = self.rule_based(token)
+        overwriten = self.rule_based(out_token)
+
         if overwriten:
             return overwriten
 
@@ -108,7 +115,7 @@
             tags[self.tasks.index(self.pos_tag)] = "ADJcar"
 
         return [
-            token,
+            input_token,
             lemma,
             tags[self.tasks.index(self.pos_tag)],
             "|".join(
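
Taken together, the classes.py changes rename the instance-level tokenizer hooks with a leading underscore and hand them to the parent class through super().__init__, so the attribute names set by the base constructor no longer collide with the subclass's methods. Below is a minimal, self-contained sketch of that wiring pattern; the class names and the toy tokenization rules are simplifications of my own, not pie_extended's actual implementation.

# Illustrative only: a simplified stand-in for SourceMemorizingTokenizer and the
# fro subclass, showing the callable-injection pattern this commit adopts.
from typing import Callable, Iterable, List


class BaseTokenizer:
    def __init__(self, sentence_tokenizer: Callable[[str], Iterable[str]],
                 word_tokenizer: Callable[[str], List[str]],
                 normalizer: Callable[[str], str]):
        # The base class stores the callables under plain attribute names...
        self.sentence_tokenizer = sentence_tokenizer
        self.word_tokenizer = word_tokenizer
        self.normalizer = normalizer

    def __call__(self, data: str, lower: bool = True) -> Iterable[List[str]]:
        if lower:
            data = data.lower()
        for sentence in self.sentence_tokenizer(data):
            yield self.word_tokenizer(sentence)


class FroLikeTokenizer(BaseTokenizer):
    def __init__(self):
        # ...so the subclass keeps its own methods underscore-prefixed and passes
        # them explicitly, instead of shadowing the attribute names with methods.
        super().__init__(
            sentence_tokenizer=self._sentence_tokenizer,
            word_tokenizer=self._word_tokenizer,
            normalizer=self._normalizer,
        )

    def _normalizer(self, data: str) -> str:
        return " ".join(data.split())

    def _sentence_tokenizer(self, data: str) -> Iterable[str]:
        data = self.normalizer(data)  # reads the attribute set by the base class
        for sent in data.split("."):
            sent = sent.strip()
            if sent:
                yield sent + " ."

    @staticmethod
    def _word_tokenizer(data: str) -> List[str]:
        return data.split()


# FroLikeTokenizer()("Rollant est proz. Oliver est sage.") yields
# ['rollant', 'est', 'proz', '.'] and then ['oliver', 'est', 'sage', '.']
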
pie_extended/pipeline/tokenizers/memorizing.py (3 changes: 1 addition & 2 deletions)
@@ -24,15 +24,14 @@ def __init__(self, sentence_tokenizer=None, word_tokenizer=None, replacer=None,
         self.tokens = [
         ]
 
-        self.re_sentence_tokenizer = sentence_tokenizer or self._sentence_tokenizer
+        self.sentence_tokenizer = sentence_tokenizer or self._sentence_tokenizer
         self.word_tokenizer = word_tokenizer or self._word_tokenizer
         self.replacer = replacer or self._replacer
         self.normalizer = normalizer or self._replacer
 
     def __call__(self, data, lower=True):
         if lower:
             data = data.lower()
-
         for sentence in self.sentence_tokenizer(data):
             toks = self.word_tokenizer(sentence)
             new_sentence = []
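
The memorizing.py fix stores the injected sentence tokenizer under self.sentence_tokenizer, the attribute that __call__ actually reads, rather than self.re_sentence_tokenizer. On the classes.py side, format_line now checks its rule table against out_token and writes input_token into the returned row, so the form memorized by the tokenizer is what ends up in the output. A rough, self-contained sketch of that input/output split follows; rule_based here is a stand-in keyed on the normalized token, and the example tokens, lemmas, and tags are invented for illustration.

# Hypothetical sketch, not pie_extended code.
from typing import List, Optional

RULES = {"1": ["1", "un", "CARDINAL"]}  # assumed rule table, keyed on normalized tokens


def rule_based(out_token: str) -> Optional[List[str]]:
    return RULES.get(out_token)


def format_line(input_token: str, out_token: str, lemma: str, pos: str) -> List[str]:
    overwriten = rule_based(out_token)   # rules are checked against the normalized form
    if overwriten:
        return overwriten
    return [input_token, lemma, pos]     # the memorized original form is what gets emitted


print(format_line(".I.", "1", lemma="un", pos="CARDINAL"))   # -> ['1', 'un', 'CARDINAL']
print(format_line("Deus", "deus", lemma="dieu", pos="NOM"))  # -> ['Deus', 'dieu', 'NOM']
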
