Skip to content

Commit

Permalink
(Fro) Fixed how apostrophes are handled, specifically when they are quotes
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 10, 2020
1 parent 17b4b57 commit ab4b4fa
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 11 deletions.
37 changes: 29 additions & 8 deletions pie_extended/models/fro/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,14 @@


class MemorizingTokenizer(SourceMemorizingTokenizer):
# NOTE(review): this span is a flattened diff — the next two assignments are
# the pre-change and post-change versions of the same attribute; at runtime
# only the second one is effective.
# Old pattern: runs of dots OR any single non-word, non-space, non-apostrophe char.
re_add_space_around_punct = re.compile(r"(\s*)(\.+|[^\w\s\'’ʼ])(\s*)")
# New pattern: the `|` is gone, so this now matches dots followed by a
# punctuation char — presumably intentional in the commit, but verify.
re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)")
# Apostrophes (', ’, ʼ) acting as quotation marks rather than elision:
# an apostrophe run is a "quote" when at least one side is a non-word char.
re_add_space_around_apostrophe_that_are_quotes = re.compile(
r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))"
# NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter
# ?'. or manger'_ or _'Bonjour
)
# Any single apostrophe with optional surrounding whitespace.
re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)")
# Apostrophe glued to the end of a word (elision, e.g. Old French "l'" / "qu'").
re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])")
# One or more sentence-ending dot characters (apostrophes excluded via the
# module-level _Dots_except_apostrophe character class), plus trailing space.
_sentence_boundaries = re.compile(
r"([" + _Dots_except_apostrophe + r"]+\s*)+"
)
Expand Down Expand Up @@ -60,20 +66,34 @@ def _sentence_tokenizer(self, data):
yield from sentences

def _replacer(self, inp: str):
    """Return *inp* with word-final apostrophes (', ’, ʼ) removed.

    Used when matching memorized source tokens against tagger output:
    elision apostrophes attached to the end of a word are stripped so
    both sides compare equal.
    """
    # NOTE(review): the flattened diff showed a superseded first line
    # (re_add_space_after_apostrophe.sub on the same input) whose result
    # was immediately overwritten — dead code, removed here.
    return self.re_remove_ending_apostrophe.sub("", inp)

# NOTE(review): this span is a flattened diff — the old and new bodies of
# _normalizer are interleaved mid-expression (unbalanced call nesting and a
# leftover print() at each version's head/tail), so it is NOT valid Python as
# shown. The post-change logic appears to be: protect roman-numeral dots as
# _DOT_…_DOT_, space out punctuation, space out quote-apostrophes, then
# detach word-final apostrophes — TODO confirm against the real commit.
def _normalizer(self, data: str):
# Pre-change version (left side of the diff): spaces after apostrophes.
data = self.re_add_space_after_apostrophe.sub(
r"\g<2> ",
self.re_add_space_around_punct.sub(
# Post-change debug print of the quote-apostrophe pass — presumably
# stray debugging left in the commit; verify before shipping.
print(self.re_add_space_around_apostrophe_that_are_quotes.sub(
r" \g<2> ",
self.re_add_space_around_punct.sub(
r" \g<2> ",
self.roman_number_dot.sub(
r"_DOT_\g<1>_DOT_",
data
)
)
))
# Post-change version (right side of the diff): ending apostrophes get a
# trailing space instead of being deleted at this stage.
data = self.re_remove_ending_apostrophe.sub(
r"\g<1> ",
self.re_add_space_around_apostrophe_that_are_quotes.sub(
r" \g<2> ",
self.roman_number_dot.sub(
r"_DOT_\g<1>_DOT_",
data
self.re_add_space_around_punct.sub(
r" \g<2> ",
self.roman_number_dot.sub(
r"_DOT_\g<1>_DOT_",
data
)
)
)
)
# Debug output — looks like leftover instrumentation; confirm removal upstream.
print(data)
return data


Expand Down Expand Up @@ -101,6 +121,7 @@ def format_line(self, token, tags, ignored=False):
tags = list(tags)
lemma = tags[self.tasks.index("lemma")]
index, input_token, out_token = self.tokenizer_memory.tokens.pop(0)

if token != out_token:
raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))

Expand Down
4 changes: 2 additions & 2 deletions pie_extended/pipeline/iterators/proto.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import re
import regex as re
import string

from pie.tagger import simple_tokenizer
Expand All @@ -8,7 +8,7 @@
from ...utils import ObjectCreator

# A Remover maps a token list to (filtered tokens, {original_index: removed_text}).
Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]]
# A token made solely of non-word characters is punctuation. \W covers the
# Unicode punctuation range, unlike the ASCII-only string.punctuation set
# used before this change; the superseded duplicate assignment left by the
# flattened diff has been removed (only the last assignment was effective).
PUNKT = re.compile(r"^\W+$")


class DataIterator:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# Package metadata consumed by setup() below.
URL = 'https://github.com/ponteineptique/nlp-pie-taggers'
AUTHOR = 'Thibault Clérice'
REQUIRES_PYTHON = '>=3.6.0'
# The flattened diff retained both the old ("0.0.2") and new version strings;
# only the final assignment took effect, so the dead duplicate is removed.
VERSION = "0.0.3"

# What packages are required for this module to be executed?

Expand Down

0 comments on commit ab4b4fa

Please sign in to comment.