
Commit

More bugs
PonteIneptique committed Feb 6, 2020
1 parent b80ac11 commit 9adf218
Showing 2 changed files with 6 additions and 20 deletions.
pie_extended/models/fro/__init__.py (1 addition & 9 deletions)
@@ -1,5 +1,5 @@
 from ...utils import Metadata, File ,get_path
-from .classes import GlueFormatter
+from .classes import get_iterator_and_formatter
 from ...pipeline.iterators.proto import DataIterator

 DESC = Metadata(

@@ -20,11 +20,3 @@
     get_path("fro", "morph.tar"),
     get_path("fro", "lemma-pos.tar")
 )
-
-
-def get_iterator_and_formatter():
-    formatter = GlueFormatter
-    iterator = DataIterator(
-        remove_from_input=DataIterator.remove_punctuation
-    )
-    return iterator, formatter
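
The net effect of this file's change: the get_iterator_and_formatter factory moves out of __init__.py into classes.py, and the old import of GlueFormatter is replaced by an import of the factory itself. A minimal usage sketch, assuming the relocated function keeps the removed version's zero-argument signature and return order:

    # Hedged sketch, not documented API: assumes the relocated factory
    # keeps the removed version's signature (no arguments) and returns
    # (iterator, formatter) in the same order.
    from pie_extended.models.fro import get_iterator_and_formatter

    iterator, formatter = get_iterator_and_formatter()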
pie_extended/models/fro/classes.py (5 additions & 11 deletions)
@@ -24,15 +24,16 @@ class MemorizingTokenizer(SourceMemorizingTokenizer):
     re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1)
     re_word_tokenizer = re.compile(r"[\s]+")
     _sentence_boundaries = re.compile(
-        r"(?<!" + _RomanNumber + r"\.)(?<=" + _Dots_collections + r"+)(\B)(?!\." + _RomanNumber + ")"
+        r"(?<!" + _RomanNumber + r"\.)(?<=" + _Dots_except_apostrophe + r"+)(\B)(?!\." + _RomanNumber + ")"
     )

     def __init__(self):
         self.tokens = []

     @classmethod
     def _sentence_tokenizer(cls, string: str) -> List[str]:
-        string = cls._sentence_boundaries.sub("\g<1><SPLIT>", string)
+        string = cls._sentence_boundaries.sub(r"\g<1><SPLIT>", string)
+        print(string)
         return string.split("<SPLIT>")

     def word_tokenizer(self, data):
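
Two fixes land in this hunk. The sentence-boundary lookbehind now builds on _Dots_except_apostrophe instead of _Dots_collections (the constants' definitions are not shown in this diff, but the names suggest apostrophes no longer count as sentence enders), and the substitution template becomes a raw string: \g is not a valid Python string escape, so the unraw "\g<1><SPLIT>" only worked by accident and triggers a DeprecationWarning on Python 3.6+. A toy illustration of the template fix, using a stand-in pattern rather than the module's real constants:

    # Toy pattern, not the module's actual _sentence_boundaries regex:
    # \g<1> is re.sub's backreference syntax, so keep the template raw.
    import re

    toy = re.compile(r"([.!?])")
    print(toy.sub(r"\g<1><SPLIT>", "Uns clers vint. Si dist!"))
    # prints: Uns clers vint.<SPLIT> Si dist!<SPLIT>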
@@ -41,16 +41,13 @@ def word_tokenizer(self, data):

     def sentence_tokenizer(self, data):
         sentences = list()
-        first_is_dot = False
-        started_writting = False  # Allows for avoiding to compute length
         for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data):
             sent = sent.strip()
             sentences.append(sent)

         print(sentences)
         yield from sentences

     def replacer(self, inp: str):
-        # inp = inp.replace("U", "V").replace("v", "u").replace("J", "I").replace("j", "i").lower()
         inp = self.re_add_space_after_apostrophe.sub("", inp)
         return inp
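
With the unused first_is_dot and started_writting flags deleted, sentence_tokenizer reduces to split, strip, collect, yield. Because every sentence is appended before the first yield, the generator is fully buffered; a sketch of the equivalent logic:

    # Equivalent sketch of the surviving logic (not the class itself):
    # the full list is built before anything is yielded, so callers see
    # no laziness benefit over returning the list directly.
    def sentence_tokenizer_sketch(splitter, data):
        sentences = [sent.strip() for sent in splitter.split(data)]
        yield from sentences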

@@ -70,12 +68,8 @@ class GlueFormatter(SourceGlueFormatter):
     NUMBER = re.compile(r"\d+")
     PONFORT = [".", "...", "!", "?"]

-    def __init__(self, tasks: List[str], tokenizer_memory: MemorizingTokenizer):
+    def __init__(self, tokenizer_memory: MemorizingTokenizer):
         super(GlueFormatter, self).__init__(tokenizer_memory=tokenizer_memory)
-        self.tasks = tasks
-        self.pos_tag = "POS"
-        if "POS" not in self.tasks and "pos" in self.tasks:
-            self.pos_tag = "pos"

     def rule_based(cls, token):
         if cls.PONCTU.match(token):
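
GlueFormatter's constructor drops the task list along with the self.tasks / self.pos_tag bookkeeping, so only the tokenizer memory is required at construction time; where the POS-tag detection went is outside the lines shown in this diff. A hedged construction sketch under the assumption that no other required arguments exist:

    # Hedged sketch of post-commit construction; assumes the constructor
    # takes no required arguments beyond those visible in this diff.
    tokenizer = MemorizingTokenizer()
    formatter = GlueFormatter(tokenizer_memory=tokenizer)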
