From 9adf218f6d1fd435dd96579ba9b129505c6637ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 6 Feb 2020 16:42:08 +0100 Subject: [PATCH] More bugs --- pie_extended/models/fro/__init__.py | 10 +--------- pie_extended/models/fro/classes.py | 16 +++++----------- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/pie_extended/models/fro/__init__.py b/pie_extended/models/fro/__init__.py index 72831d8..b2d3ad9 100644 --- a/pie_extended/models/fro/__init__.py +++ b/pie_extended/models/fro/__init__.py @@ -1,5 +1,5 @@ from ...utils import Metadata, File ,get_path -from .classes import GlueFormatter +from .classes import get_iterator_and_formatter from ...pipeline.iterators.proto import DataIterator DESC = Metadata( @@ -20,11 +20,3 @@ get_path("fro", "morph.tar"), get_path("fro", "lemma-pos.tar") ) - - -def get_iterator_and_formatter(): - formatter = GlueFormatter - iterator = DataIterator( - remove_from_input=DataIterator.remove_punctuation - ) - return iterator, formatter diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/classes.py index b542a94..19146e1 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/classes.py @@ -24,7 +24,7 @@ class MemorizingTokenizer(SourceMemorizingTokenizer): re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1) re_word_tokenizer = re.compile(r"[\s]+") _sentence_boundaries = re.compile( - r"(? List[str]: - string = cls._sentence_boundaries.sub("\g<1>", string) + string = cls._sentence_boundaries.sub(r"\g<1>", string) + print(string) return string.split("") def word_tokenizer(self, data): @@ -41,16 +42,13 @@ def word_tokenizer(self, data): def sentence_tokenizer(self, data): sentences = list() - first_is_dot = False - started_writting = False # Allows for avoiding to compute length for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data): sent = sent.strip() sentences.append(sent) - + print(sentences) yield from sentences def replacer(self, inp: str): - # inp = inp.replace("U", "V").replace("v", "u").replace("J", "I").replace("j", "i").lower() inp = self.re_add_space_after_apostrophe.sub("", inp) return inp @@ -70,12 +68,8 @@ class GlueFormatter(SourceGlueFormatter): NUMBER = re.compile(r"\d+") PONFORT = [".", "...", "!", "?"] - def __init__(self, tasks: List[str], tokenizer_memory: MemorizingTokenizer): + def __init__(self, tokenizer_memory: MemorizingTokenizer): super(GlueFormatter, self).__init__(tokenizer_memory=tokenizer_memory) - self.tasks = tasks - self.pos_tag = "POS" - if "POS" not in self.tasks and "pos" in self.tasks: - self.pos_tag = "pos" def rule_based(cls, token): if cls.PONCTU.match(token):