From 5f64babd02271f8b2264bec5208c142c12c196ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 16:44:16 +0100 Subject: [PATCH 01/14] (Software API) Huge start of a rework to make PostProcessing easier and available outside. Partial answer to #6 --- pie_extended/cli/sub.py | 4 +- pie_extended/models/fro/__init__.py | 4 +- pie_extended/models/fro/classes.py | 79 +++++++------------ pie_extended/models/lasla/__init__.py | 2 +- pie_extended/models/lasla/classes.py | 60 ++++++++------ pie_extended/pipeline/formatters/glue.py | 60 -------------- pie_extended/pipeline/formatters/proto.py | 25 +++++- .../pipeline/postprocessor/__init__.py | 0 .../pipeline/postprocessor/disambiguator.py | 22 ++++++ pie_extended/pipeline/postprocessor/glue.py | 52 ++++++++++++ pie_extended/pipeline/postprocessor/memory.py | 29 +++++++ pie_extended/pipeline/postprocessor/proto.py | 79 +++++++++++++++++++ .../pipeline/postprocessor/rulebased.py | 21 +++++ pie_extended/tagger.py | 56 ++++++------- tests/test_models/test_lasla.py | 10 +-- 15 files changed, 318 insertions(+), 185 deletions(-) delete mode 100644 pie_extended/pipeline/formatters/glue.py create mode 100644 pie_extended/pipeline/postprocessor/__init__.py create mode 100644 pie_extended/pipeline/postprocessor/disambiguator.py create mode 100644 pie_extended/pipeline/postprocessor/glue.py create mode 100644 pie_extended/pipeline/postprocessor/memory.py create mode 100644 pie_extended/pipeline/postprocessor/proto.py create mode 100644 pie_extended/pipeline/postprocessor/rulebased.py diff --git a/pie_extended/cli/sub.py b/pie_extended/cli/sub.py index 3f1f917..13c2e56 100644 --- a/pie_extended/cli/sub.py +++ b/pie_extended/cli/sub.py @@ -50,8 +50,8 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) def tag_file(model: str, tagger: ExtensibleTagger, fpath): module = get_model(model) - iterator, formatter = getattr(module, "get_iterator_and_formatter")() - tagger.tag_file(fpath, iterator=iterator, formatter_class=formatter) + iterator, processor = getattr(module, "get_iterator_and_processor")() + tagger.tag_file(fpath, iterator=iterator, processor=processor) return True diff --git a/pie_extended/models/fro/__init__.py b/pie_extended/models/fro/__init__.py index b2d3ad9..63da07b 100644 --- a/pie_extended/models/fro/__init__.py +++ b/pie_extended/models/fro/__init__.py @@ -1,5 +1,5 @@ -from ...utils import Metadata, File ,get_path -from .classes import get_iterator_and_formatter +from ...utils import Metadata, File, get_path +from .classes import get_iterator_and_processor from ...pipeline.iterators.proto import DataIterator DESC = Metadata( diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/classes.py index 99fff84..01abff4 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/classes.py @@ -1,8 +1,11 @@ import regex as re -from typing import List -from ...pipeline.formatters.glue import GlueFormatter as SourceGlueFormatter +from typing import List, Dict from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor +from pie_extended.pipeline.postprocessor.glue import GlueProcessor # Uppercase regexp 
_uppercase = re.compile("^[A-ZÉÈÀÂÊÎÔÛŶÄËÏÖÜŸ]$") @@ -86,67 +89,41 @@ def _normalizer(self, data: str): return data -class GlueFormatter(SourceGlueFormatter): - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"] - +class FroRulesProcessor(RuleBasedProcessor): PONCTU = re.compile(r"^\W+$") NUMBER = re.compile(r"\d+") PONFORT = [".", "...", "!", "?"] - def __init__(self, tokenizer_memory: MemorizingTokenizer): - super(GlueFormatter, self).__init__(tokenizer_memory=tokenizer_memory) - - def rule_based(cls, token): - if cls.PONCTU.match(token): - lemma = token - if token in GlueFormatter.PONFORT: + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + token = annotation["form"] + if self.PONCTU.match(token): + if token in self.PONFORT: pos = "PONfrt" else: pos = "PONfbl" - return [token, lemma, pos, "MORPH=empty", token] - - def format_line(self, token, tags, ignored=False): - tags = list(tags) - lemma = tags[self.tasks.index("lemma")] - index, input_token, out_token = self.tokenizer_memory.tokens.pop(0) - - if token != out_token: - raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) - - overwriten = self.rule_based(out_token) - - if overwriten: - return overwriten - - if type(self).NUMBER.match(token): # This would push for sending the whole elements to rule_based and - # not the token only - lemma = token - tags[self.tasks.index(self.pos_tag)] = "ADJcar" - - return [ - input_token, - lemma, - tags[self.tasks.index(self.pos_tag)], - "|".join( - "{cat}={tag}".format( - cat=morph_part, - tag=tags[self.tasks.index(morph_part.replace(".", ""))] - ) - for morph_part in GlueFormatter.MORPH_PART - if morph_part.replace(".", "") in self.tasks and - tags[self.tasks.index(morph_part.replace(".", ""))] != "_" - ) or "MORPH=empty", - out_token - ] + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} + elif self.NUMBER.match(token): + annotation["pos"] = "ADJcar" + return annotation + +class FroGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} + MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} -def get_iterator_and_formatter(): + +def get_iterator_and_processor(): tokenizer = MemorizingTokenizer() - formatter = GlueFormatter(tokenizer) + processor = FroRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=FroGlueProcessor() + ) + ) iterator = DataIterator( tokenizer=tokenizer, remove_from_input=DataIterator.remove_punctuation ) - return iterator, formatter + return iterator, processor diff --git a/pie_extended/models/lasla/__init__.py b/pie_extended/models/lasla/__init__.py index 05f68de..7074413 100644 --- a/pie_extended/models/lasla/__init__.py +++ b/pie_extended/models/lasla/__init__.py @@ -1,2 +1,2 @@ from pie_extended.models.lasla.consts import DOWNLOADS, Models, Disambiguator, addons, DESC -from pie_extended.models.lasla.classes import get_iterator_and_formatter +from pie_extended.models.lasla.classes import get_iterator_and_processor diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py index 7276346..3599be8 100644 --- a/pie_extended/models/lasla/classes.py +++ b/pie_extended/models/lasla/classes.py @@ -13,10 +13,35 @@ from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.formatters.glue import GlueFormatter as 
SourceGlueFormatter +from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor +from pie_extended.pipeline.postprocessor.glue import GlueProcessor from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer +from typing import Dict +class LatinRulesProcessor(RuleBasedProcessor): + PONCTU = re.compile(r"^\W+$") + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + # If Else condition + token = annotation["form"] + if self.PONCTU.match(token): + return {"form": token, "lemma": token, "POS": "PUNC", "morph": "MORPH=empty"} + elif token.startswith("-"): + if token == "-ne": + annotation["lemma"] = "ne2" + else: + annotation["lemma"] = "ne" + return annotation + + +class LatinGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + WHEN_EMPTY = {"morph": "MORPH=empty"} + # Uppercase regexp uppercase = re.compile(r"^[A-Z]$") @@ -74,33 +99,16 @@ def normalizer(self, data: str): return data -class GlueFormatter(SourceGlueFormatter): - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"] - PONCTU = re.compile(r"^\W+$") - - def __init__(self, tokenizer_memory): - super(GlueFormatter, self).__init__([]) - self.tokenizer_memory = tokenizer_memory - - def rule_based(cls, token): - if cls.PONCTU.match(token): - return [token, token, "PUNC", "MORPH=empty", token] - elif token.startswith("-"): - if token == "-ne": - lemma = "ne2" - else: - lemma = token[1:] - return [token, lemma, "CONcoo", "MORPH=empty", token] - - return None - - -def get_iterator_and_formatter(): +def get_iterator_and_processor(): tokenizer = MemorizingTokenizer() - formatter = GlueFormatter(tokenizer) + processor = LatinRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=LatinGlueProcessor() + ) + ) iterator = DataIterator( tokenizer=tokenizer, remove_from_input=DataIterator.remove_punctuation ) - return iterator, formatter + return iterator, processor diff --git a/pie_extended/pipeline/formatters/glue.py b/pie_extended/pipeline/formatters/glue.py deleted file mode 100644 index b1024dd..0000000 --- a/pie_extended/pipeline/formatters/glue.py +++ /dev/null @@ -1,60 +0,0 @@ -import regex as re -from .proto import Formatter - - -class GlueFormatter(Formatter): - """ Need replacing of morph_part for specific corpora - - """ - - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"] - PONCTU = re.compile(r"^\W+$") - - def __init__(self, tokenizer_memory): - super(GlueFormatter, self).__init__([]) - self.tokenizer_memory = tokenizer_memory - - def __call__(self, tasks): - super(GlueFormatter, self).__init__(tasks) - self.pos_tag = "POS" - if "POS" not in self.tasks and "pos" in self.tasks: - self.pos_tag = "pos" - return self - - @classmethod - def get_headers(cls): - return cls.HEADERS - - def rule_based(cls, token): - if cls.PONCTU.match(token): - return [token, token, "PUNC", "MORPH=empty", token] - - return None - - def format_line(self, token, tags, ignored=False): - tags = list(tags) - lemma = tags[self.tasks.index("lemma")] - index, input_token, out_token = self.tokenizer_memory.tokens.pop(0) - if 
token != out_token: - raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) - - overwriten = self.rule_based(token) - if overwriten: - return overwriten - - return [ - input_token, - lemma, - tags[self.tasks.index(self.pos_tag)], - "|".join( - "{cat}={tag}".format( - cat=morph_part, - tag=tags[self.tasks.index(morph_part)] - ) - for morph_part in type(self).MORPH_PART - if morph_part in self.tasks and - tags[self.tasks.index(morph_part)] != "_" - ) or "MORPH=empty", - out_token - ] diff --git a/pie_extended/pipeline/formatters/proto.py b/pie_extended/pipeline/formatters/proto.py index f4b2c2d..ac69acc 100644 --- a/pie_extended/pipeline/formatters/proto.py +++ b/pie_extended/pipeline/formatters/proto.py @@ -1,13 +1,30 @@ -from typing import List, Iterable +from typing import List, Iterable, Callable, Dict +import sys class Formatter: # Default is TSV + """ The CSV formatter necessarily starts with form in its header. + + """ + format_line: Callable[[Dict[str, str]], List[str]] + def __init__(self, tasks: List[str]): self.tasks: List[str] = tasks - def format_line(self, token: str, tags: Iterable[str], ignored=False) -> List[str]: - """ Format the tags""" - return [token] + list(tags) + if sys.version_info.minor <= 6: + # Before 3.7, order of dictionary is not guaranteed + # Cf. https://mail.python.org/pipermail/python-dev/2017-December/151283.html + self.format_line = self.format_line_3_6 + else: + self.format_line = self.format_line_3_7 + + def format_line_3_6(self, annotation: Dict[str, str]) -> List[str]: + """ Format the tags """ + return [annotation["form"]] + [annotation[task] for task in self.tasks] + + def format_line_3_7(self, annotation: Dict[str, str]) -> List[str]: + """ Format the tags """ + return list(annotation.values()) def write_line(self, formatted): return "\t".join(formatted) + "\r\n" diff --git a/pie_extended/pipeline/postprocessor/__init__.py b/pie_extended/pipeline/postprocessor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pie_extended/pipeline/postprocessor/disambiguator.py b/pie_extended/pipeline/postprocessor/disambiguator.py new file mode 100644 index 0000000..79d0895 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/disambiguator.py @@ -0,0 +1,22 @@ +from ..disambiguators.proto import Disambiguator +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List + + +# Right now disambiguation is applied at the sentence level. Question is should we ? 
+# Keeping that here for the moment + +class DisambiguatorProcessor(ChainedProcessor): + """ Applies rules found in rules(token_annotation) + + """ + + def __init__(self, disambiguator: Disambiguator, head_processor: Optional[ProcessorPrototype], **kwargs): + super(DisambiguatorProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.disambiguator: Disambiguator = disambiguator + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + return annotation + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py new file mode 100644 index 0000000..171d406 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -0,0 +1,52 @@ +from .proto import ProcessorPrototype, RenamedTaskProcessor +from typing import Generator, Dict, List + + +class GlueProcessor(RenamedTaskProcessor): + """ Glues together specific tasks + + """ + + # Output keys are keys that are given in the end + OUTPUT_KEYS: List[str] = ["form", "lemma", "POS", "morph"] + # Glue dicts contains tasks that should merge together subtasks + GLUE: Dict[str, List[str]] = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + # Glue_char is what is used to glue things together -> Tense=Pres|Person=1 + GLUE_CHAR: str = "|" + # Glue Empty are value to take when all things glued together are empty + GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} + + def __init__(self): + super(GlueProcessor, self).__init__() + + # Sets-up some copy of the values + self._out = type(self).OUTPUT_KEYS + self._glue = type(self).GLUE + self._glue_char = type(self).GLUE_CHAR + self._glue_empty = type(self).GLUE_EMPTY + + def set_tasks(self, tasks): + super(GlueProcessor, self).set_tasks(tasks) + + def _yield_annotation( + self, + token_dict: Dict[str, str] + ) -> Generator[str, None, None]: + # For each key we should return + print(self.tasks) + for head in self._out: + if head not in self._glue: + yield head, token_dict[head] + else: + # Otherwise, we glue together things that should be glued together + joined = self._glue_char.join([token_dict[glued_task] for glued_task in self._glue[head]]) + if not joined: + joined = self._glue_empty[head] + yield head, joined + + def reinsert(self, form: str) -> Dict[str, str]: + return dict(form=form, **{key: self.empty_value for key in self._out if key != "form"}) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + as_dict = super(GlueProcessor, self).get_dict(token, tags) + return dict(self._yield_annotation(as_dict)) diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py new file mode 100644 index 0000000..83d7801 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -0,0 +1,29 @@ +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List +if "typing" == "nottyping": + from ..tokenizers.memorizing import MemorizingTokenizer + + +class MemoryzingProcessor(ChainedProcessor): + """ MemoryzingProcessor proposes to keep track of changes operated on input string + by reinserting the original data alongside a new task (KEY) where we output + the input seen by the Model + + """ + KEY: str = "treated" + + def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Optional[ProcessorPrototype], **kwargs): + 
super(MemoryzingProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.memory: "MemorizingTokenizer" = tokenizer_memory + self._key: str = type(self).KEY + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + # First we get the dictionary + token_dict = self.head_processor.get_dict(token, tags) + index, input_token, out_token = self.memory.tokens.pop(0) + if token != out_token: + raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) + + token_dict[self._key] = out_token + token_dict["form"] = input_token + return token_dict \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py new file mode 100644 index 0000000..dd1aefb --- /dev/null +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -0,0 +1,79 @@ +from typing import List, Dict, Optional, Type + +DEFAULT_EMPTY = "_" + + +class ProcessorPrototype: + tasks: List[str] + empty_value: str + + def __init__(self, empty_value: Optional[str] = None): + self.tasks = [] + self.empty_value = empty_value or DEFAULT_EMPTY + + def set_tasks(self, tasks): + self.tasks = tasks + + def postprocess(self, line): + pass + + def reinsert(self, form: str) -> Dict[str, str]: + """ Generates an automatic line for a token that was removed from lemmatization + + :param form: Token to reinsert + :return: Dictionary representation of the token, as an annotation + """ + return dict(form=form, **{task: self.empty_value for task in self.tasks}) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + """ Get the dictionary representation of a token annotation + + :param token: + :param tags: + :return: + """ + return dict(form=token, **dict(zip(self.tasks, tags))) + + def reset(self): + """ Functions that should be run in between documents """ + pass + + +class RenamedTaskProcessor(ProcessorPrototype): + MAP: Dict[str, str] + + def __init__(self, **kwargs): + super(RenamedTaskProcessor, self).__init__(**kwargs) + self._map: Dict[str, str] = type(self).MAP + + def set_tasks(self, tasks): + return [self._map.get(task, task) for task in tasks] + + +class ChainedProcessor(ProcessorPrototype): + """ Allows for easy chaining ! 
+ + ChainedProcessor(ProcessorPrototype) basically should behave like a normal processor + + """ + head_processor: ProcessorPrototype + + def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + super(ChainedProcessor, self).__init__(**kwargs) + + self.head_processor: ProcessorPrototype = head_processor + if not self.head_processor: + self.head_processor = ProcessorPrototype() + + def set_tasks(self, tasks): + super(ChainedProcessor, self).set_tasks(tasks) + self.head_processor.set_tasks(tasks) + + def reinsert(self, form: str) -> Dict[str, str]: + return self.head_processor.reinsert(form) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.head_processor.get_dict(token, tags) + + def reset(self): + self.head_processor.reset() \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py new file mode 100644 index 0000000..279f97e --- /dev/null +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -0,0 +1,21 @@ +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List +if "typing" == "nottyping": + from ..tokenizers.memorizing import MemorizingTokenizer + + +class RuleBasedProcessor(ChainedProcessor): + """ Applies rules found in rules(token_annotation) + + """ + KEY: str = "treated" + + def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs) + self._key: str = type(self).KEY + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + return annotation + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 8c2dc9d..9be9a01 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Optional, Dict, Generator, Type from pie.tagger import Tagger from pie import utils @@ -7,6 +7,7 @@ from .pipeline.formatters.proto import Formatter from .pipeline.disambiguators.proto import Disambiguator from .pipeline.iterators.proto import DataIterator +from .pipeline.postprocessor.proto import ProcessorPrototype class ExtensibleTagger(Tagger): @@ -30,7 +31,7 @@ def reinsert_full(self, formatter, sent_reinsertion, tasks): ) yield formatter.write_sentence_end() - def tag_file(self, fpath: str, iterator: DataIterator, formatter_class: type): + def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorPrototype): # Read content of the file with open(fpath) as f: data = f.read() @@ -38,16 +39,15 @@ def tag_file(self, fpath: str, iterator: DataIterator, formatter_class: type): _, ext = os.path.splitext(fpath) with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f: - for line in self.iter_tag(data, iterator, formatter_class): + for line in self.iter_tag(data, iterator, processor=processor): f.write(line) - def tag_str(self, data: str, iterator: DataIterator, formatter_class: type) -> str: - return "".join(list(self.iter_tag(data, iterator, formatter_class))) - - def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): - header = False - formatter = None + def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) -> str: + return list(self.iter_tag_token(data, iterator, processor=processor)) + def 
iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) \ + -> Generator[Dict[str, str], None, None]: + processor.reset() for chunk in utils.chunks( iterator(data, lower=self.lower), size=self.batch_size): @@ -61,7 +61,8 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): sents=[sent for sent in sents if sent], lengths=lengths ) - formatter: Formatter = formatter_class(tasks) + if not processor.tasks: + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): @@ -73,13 +74,6 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): # Gets things that needs to be reinserted sent_reinsertion = needs_reinsertion[sents_index] - # If the header has not yet be written, write it - if not header: - yield formatter.write_headers() - header = True - - yield formatter.write_sentence_beginning() - # If we have a disambiguator, we run the results into it if self.disambiguation: sent = self.disambiguation(sent, tasks) @@ -88,29 +82,23 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): for index, (token, tags) in enumerate(sent): while reinsertion_index + index in sent_reinsertion: - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion_index + index], - tags=[""] * len(tasks) - ) - ) + yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 - yield formatter.write_line( - formatter.format_line(token, tags) - ) + yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion], - tags=[""] * len(tasks) - ) - ) + yield processor.reinsert(sent_reinsertion[reinsertion]) - yield formatter.write_sentence_end() + def iter_tag(self, data: str, iterator: DataIterator, processor: type): + formatter = None + for annotation in self.iter_tag_token(data, iterator, processor): + if not formatter: + formatter = Formatter(list(annotation.keys())) + yield formatter.write_headers() + yield formatter.write_line(formatter) if formatter: - yield formatter.write_footer() + yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 2a5cac5..5527508 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -40,7 +40,7 @@ def test_consecutive_dots(self): result = tagger.tag_str( data="id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum ." " . causam turbationis hanc docuit quod pater", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) self.assertIn( @@ -62,7 +62,7 @@ def test_leading_punctuation(self): ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) self.assertIn( @@ -87,7 +87,7 @@ def test_punctuation_is_not_seen(self): ]) tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) self.assertNotIn( @@ -105,7 +105,7 @@ def test_j_are_temporarly_replaced(self): ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et judicis uiduarum . 
.", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) @@ -121,7 +121,7 @@ def test_underscores(self): ]) tagger.tag_str( string, - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) From 29e8e4eaf2204a41b22746f0380b1083b80c4e80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 16:54:22 +0100 Subject: [PATCH 02/14] Where are my tasks gone --- pie_extended/pipeline/postprocessor/proto.py | 5 ++++- pie_extended/tagger.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index dd1aefb..4a615f1 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -13,6 +13,7 @@ def __init__(self, empty_value: Optional[str] = None): def set_tasks(self, tasks): self.tasks = tasks + print(tasks, self.tasks) def postprocess(self, line): pass @@ -32,7 +33,9 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: :param tags: :return: """ - return dict(form=token, **dict(zip(self.tasks, tags))) + print("Do I have task ?", self.tasks) + print({"form":token, **{k: val for k, val in zip(self.tasks, tags)}}) + return {"form":token, **{k: val for k, val in zip(self.tasks, tags)}} def reset(self): """ Functions that should be run in between documents """ diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 9be9a01..46741e9 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -61,8 +61,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor sents=[sent for sent in sents if sent], lengths=lengths ) - if not processor.tasks: - processor.set_tasks(tasks) + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): From 51734a3e6ec2d4808b65f5a0cb6d45c1b4acc269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 19:37:52 +0100 Subject: [PATCH 03/14] (Architecture Rework) Completely reworked tokenizer --- pie_extended/models/fro/classes.py | 50 +++---- pie_extended/models/lasla/classes.py | 131 +++++++++++------- pie_extended/pipeline/iterators/proto.py | 16 +-- pie_extended/pipeline/postprocessor/glue.py | 5 +- pie_extended/pipeline/postprocessor/memory.py | 6 +- pie_extended/pipeline/postprocessor/proto.py | 9 +- .../pipeline/tokenizers/memorizing.py | 67 ++++----- .../pipeline/tokenizers/simple_tokenizer.py | 33 +++++ pie_extended/tagger.py | 9 +- tests/test_models/test_lasla.py | 26 ++-- 10 files changed, 201 insertions(+), 151 deletions(-) create mode 100644 pie_extended/pipeline/tokenizers/simple_tokenizer.py diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/classes.py index 01abff4..482b696 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/classes.py @@ -1,14 +1,11 @@ import regex as re -from typing import List, Dict -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer +from typing import List, Dict, Generator +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor from 
pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor from pie_extended.pipeline.postprocessor.glue import GlueProcessor -# Uppercase regexp -_uppercase = re.compile("^[A-ZÉÈÀÂÊÎÔÛŶÄËÏÖÜŸ]$") _Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“" _Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]" @@ -20,7 +17,7 @@ r"(?:XC|XL|L?X{0,3})(?:IX|I?V|V?I{1,3}))" -class MemorizingTokenizer(SourceMemorizingTokenizer): +class FroMemorizingTokenizer(MemorizingTokenizer): re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" @@ -35,11 +32,7 @@ class MemorizingTokenizer(SourceMemorizingTokenizer): roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") def __init__(self): - super(MemorizingTokenizer, self).__init__( - sentence_tokenizer=self._sentence_tokenizer, - word_tokenizer=self._word_tokenizer, - normalizer=self._normalizer - ) + super(FroMemorizingTokenizer, self).__init__() self.tokens = [] @staticmethod @@ -54,25 +47,22 @@ def _real_sentence_tokenizer(cls, string: str) -> List[str]: string = string.replace("_DOT_", ".") return string.split("") - @staticmethod - def _word_tokenizer(data): - # ICI, il faut que tu tokenizes toi-meme avec une fonction à toi - return data.split() + def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + if lower: + text = text.lower() + text = text.split() + return text - def _sentence_tokenizer(self, data): + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: sentences = list() - data = self.normalizer(data) + data = self.normalizer(text) for sent in self._real_sentence_tokenizer(data): sent = sent.strip() if sent: - sentences.append(sent) + sentences.append(self.word_tokenizer(sent)) yield from sentences - def _replacer(self, inp: str): - out = self.re_remove_ending_apostrophe.sub("", inp) - return out - - def _normalizer(self, data: str): + def normalizer(self, data: str) -> str: data = self.re_remove_ending_apostrophe.sub( r"\g<1> ", self.re_add_space_around_apostrophe_that_are_quotes.sub( @@ -90,6 +80,9 @@ def _normalizer(self, data: str): class FroRulesProcessor(RuleBasedProcessor): + """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically + + """ PONCTU = re.compile(r"^\W+$") NUMBER = re.compile(r"\d+") PONFORT = [".", "...", "!", "?"] @@ -106,15 +99,24 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: annotation["pos"] = "ADJcar" return annotation + def __init__(self, *args, **kwargs): + super(FroRulesProcessor, self).__init__(*args, **kwargs) + class FroGlueProcessor(GlueProcessor): + """ We glue morphological features into one column + + """ OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + def __init__(self, *args, **kwargs): + super(FroGlueProcessor, self).__init__(*args, **kwargs) + def get_iterator_and_processor(): - tokenizer = MemorizingTokenizer() + tokenizer = FroMemorizingTokenizer() processor = FroRulesProcessor( MemoryzingProcessor( tokenizer_memory=tokenizer, diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py index 3599be8..fd0ad69 100644 --- a/pie_extended/models/lasla/classes.py +++ 
b/pie_extended/models/lasla/classes.py @@ -1,6 +1,13 @@ +from typing import Dict, List, Generator import sys import regex as re import click +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer +from pie_extended.models.fro.classes import _RomanNumber, _Dots_except_apostrophe, _Dots_collections try: import cltk @@ -12,23 +19,19 @@ sys.exit(0) -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor -from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor -from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor -from pie_extended.pipeline.postprocessor.glue import GlueProcessor -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer -from typing import Dict +class LatinRulesProcessor(RuleBasedProcessor): + """ Lasla data has no punctuation, we tag it automatically. + "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) -class LatinRulesProcessor(RuleBasedProcessor): + """ PONCTU = re.compile(r"^\W+$") def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: # If Else condition token = annotation["form"] if self.PONCTU.match(token): - return {"form": token, "lemma": token, "POS": "PUNC", "morph": "MORPH=empty"} + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} elif token.startswith("-"): if token == "-ne": annotation["lemma"] = "ne2" @@ -36,71 +39,101 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: annotation["lemma"] = "ne" return annotation + def __init__(self, *args, **kwargs): + super(LatinRulesProcessor, self).__init__(*args, **kwargs) + class LatinGlueProcessor(GlueProcessor): OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} WHEN_EMPTY = {"morph": "MORPH=empty"} + MAP = {"pos": "POS"} + + def __init__(self, *args, **kwargs): + super(LatinGlueProcessor, self).__init__(*args, **kwargs) + # Uppercase regexp uppercase = re.compile(r"^[A-Z]$") -class MemorizingTokenizer(SourceMemorizingTokenizer): - - re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\.])(\s*)") - re_normalize_space = re.compile(r"(\s+)") - re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1) +class LatMemorizingTokenizer(MemorizingTokenizer): + re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") + re_add_space_around_apostrophe_that_are_quotes = re.compile( + r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" + # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + # ?'. 
or manger'_ or _'Bonjour + ) + re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") + re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])") + _sentence_boundaries = re.compile( + r"([" + _Dots_except_apostrophe + r"]+\s*)+" + ) + roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") def __init__(self): - self.tokens = [ - ] - + super(LatMemorizingTokenizer, self).__init__() + self.tokens = [] self._word_tokenizer = WordTokenizer("latin") - def word_tokenizer(self, data): - return self._word_tokenizer.tokenize(data) - - def sentence_tokenizer(self, data): + @staticmethod + def _sentence_tokenizer_merge_matches(match): + """ Best way we found to deal with repeating groups""" + start, end = match.span() + return match.string[start:end] + "" + + @classmethod + def _real_sentence_tokenizer(cls, string: str) -> List[str]: + string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) + string = string.replace("_DOT_", ".") + return string.split("") + + def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + tokenized = [tok for tok in self._word_tokenizer.tokenize(text) if tok] + if tokenized: + tokenized = [tok.lower() for tok in tokenized] + return tokenized + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: sentences = list() - first_is_dot = False - started_writting = False # Allows for avoiding to compute length - for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data): + data = self.normalizer(text) + for sent in self._real_sentence_tokenizer(data): sent = sent.strip() if sent: - if MemorizingTokenizer.re_sentence_tokenizer.match(sent): - if not started_writting: - sentences.append(sent) - first_is_dot = True - else: - sentences[-1] += " " + sent - else: - if first_is_dot: - sentences[-1] += " " + sent - first_is_dot = False - else: - sentences.append(sent) - - if not started_writting and len(sentences): - started_writting = True - + sentences.append(self.word_tokenizer(sent)) yield from sentences + def normalizer(self, data: str) -> str: + data = self.re_remove_ending_apostrophe.sub( + r"\g<1> ", + self.re_add_space_around_apostrophe_that_are_quotes.sub( + r" \g<2> ", + self.re_add_space_around_punct.sub( + r" \g<2> ", + self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) + ) + ) + ) + return data + def replacer(self, inp: str): - inp = inp.replace("U", "V").replace("v", "u").replace("J", "I").replace("j", "i").lower() + inp = inp.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i") return inp - def normalizer(self, data: str): - # Fix regarding the current issue of apostrophe - # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 - # On the other hand, it creates empty tokens... - data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) - data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - return data + #def normalizer(self, data: str): + # # Fix regarding the current issue of apostrophe + # # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 + # # On the other hand, it creates empty tokens... 
+ # data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) + # data = MemorizingTokenizer.re_normalize_space.sub(" ", data) + # return data def get_iterator_and_processor(): - tokenizer = MemorizingTokenizer() + tokenizer = LatMemorizingTokenizer() processor = LatinRulesProcessor( MemoryzingProcessor( tokenizer_memory=tokenizer, diff --git a/pie_extended/pipeline/iterators/proto.py b/pie_extended/pipeline/iterators/proto.py index 8229ec5..89d0bae 100644 --- a/pie_extended/pipeline/iterators/proto.py +++ b/pie_extended/pipeline/iterators/proto.py @@ -1,23 +1,22 @@ import regex as re -import string from pie.tagger import simple_tokenizer from typing import Callable, List, Tuple, Dict, Union, Iterable -from ...pipeline.tokenizers.classes import Tokenizer from ...utils import ObjectCreator +from ..tokenizers.simple_tokenizer import SimpleTokenizer Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]] PUNKT = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1) class DataIterator: - def __init__(self, tokenizer: Union[ObjectCreator, Tokenizer] = None, remove_from_input: Callable = None): + def __init__(self, tokenizer: SimpleTokenizer = None, remove_from_input: Callable = None): """ Iterator used to parse the text and returns bits to tag :param tokenizer: Tokenizer """ - self.tokenizer = tokenizer or simple_tokenizer + self.tokenizer: SimpleTokenizer = tokenizer or SimpleTokenizer() self.remove_from_input = remove_from_input if self.remove_from_input is None: self.remove_from_input = lambda x: (x, {}) @@ -41,12 +40,6 @@ def remove_punctuation(sentence: List[str]) -> Tuple[List[str], Dict[int, str]]: clean.append(token) return clean, removed - def get_tokenizer(self) -> Tokenizer: - """ Get the tokenizer if it needs to be created""" - if isinstance(self.tokenizer, ObjectCreator): - return self.tokenizer.create() - return self.tokenizer - def get_remover(self) -> Remover: if isinstance(self.remove_from_input, ObjectCreator): return self.remove_from_input.create() @@ -60,8 +53,7 @@ def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], :param lower: Whether or not to lower the text :yields: (Sentence as a list of word, Size of the sentence, Elements removed from the sentence) """ - tokenizer = self.get_tokenizer() remover = self.get_remover() - for sentence in tokenizer(data, lower=lower): + for sentence in self.tokenizer.sentence_tokenizer(data, lower=lower): clean_sentence, removed_from_input = remover(sentence) yield clean_sentence, len(clean_sentence), removed_from_input diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 171d406..0749394 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -16,8 +16,8 @@ class GlueProcessor(RenamedTaskProcessor): # Glue Empty are value to take when all things glued together are empty GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} - def __init__(self): - super(GlueProcessor, self).__init__() + def __init__(self, *args, **kwargs): + super(GlueProcessor, self).__init__(*args, **kwargs) # Sets-up some copy of the values self._out = type(self).OUTPUT_KEYS @@ -33,7 +33,6 @@ def _yield_annotation( token_dict: Dict[str, str] ) -> Generator[str, None, None]: # For each key we should return - print(self.tasks) for head in self._out: if head not in self._glue: yield head, token_dict[head] diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index 
83d7801..b86183e 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -26,4 +26,8 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: token_dict[self._key] = out_token token_dict["form"] = input_token - return token_dict \ No newline at end of file + return token_dict + + def reinsert(self, form: str) -> Dict[str, str]: + self.memory.tokens.pop(0) + return super(MemoryzingProcessor, self).reinsert(form) \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index 4a615f1..d8a7e49 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -13,7 +13,6 @@ def __init__(self, empty_value: Optional[str] = None): def set_tasks(self, tasks): self.tasks = tasks - print(tasks, self.tasks) def postprocess(self, line): pass @@ -33,9 +32,7 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: :param tags: :return: """ - print("Do I have task ?", self.tasks) - print({"form":token, **{k: val for k, val in zip(self.tasks, tags)}}) - return {"form":token, **{k: val for k, val in zip(self.tasks, tags)}} + return {"form": token, **{k: val for k, val in zip(self.tasks, tags)}} def reset(self): """ Functions that should be run in between documents """ @@ -43,14 +40,14 @@ def reset(self): class RenamedTaskProcessor(ProcessorPrototype): - MAP: Dict[str, str] + MAP: Dict[str, str] = {} def __init__(self, **kwargs): super(RenamedTaskProcessor, self).__init__(**kwargs) self._map: Dict[str, str] = type(self).MAP def set_tasks(self, tasks): - return [self._map.get(task, task) for task in tasks] + self.tasks = [self._map.get(task, task) for task in tasks] class ChainedProcessor(ProcessorPrototype): diff --git a/pie_extended/pipeline/tokenizers/memorizing.py b/pie_extended/pipeline/tokenizers/memorizing.py index b338529..13cd39e 100644 --- a/pie_extended/pipeline/tokenizers/memorizing.py +++ b/pie_extended/pipeline/tokenizers/memorizing.py @@ -1,45 +1,32 @@ -class MemorizingTokenizer(object): +from .simple_tokenizer import SimpleTokenizer +from typing import List, Tuple, Dict + + +class MemorizingTokenizer(SimpleTokenizer): """ Tokenizer that memoryze what it tokenized. Mostly used to normalized input as input time and then reinserting normalized input """ - @staticmethod - def _sentence_tokenizer(string): - for s in string.split("."): - if s.strip(): - yield s.strip() + " " + "." 
- - @staticmethod - def _word_tokenizer(string): - for s in string.split(): - if s.strip: - yield s.strip() - - @staticmethod - def _replacer(inp: str): - return inp - - def __init__(self, sentence_tokenizer=None, word_tokenizer=None, replacer=None, normalizer=None): - self.tokens = [ - ] - - self.sentence_tokenizer = sentence_tokenizer or self._sentence_tokenizer - self.word_tokenizer = word_tokenizer or self._word_tokenizer - self.replacer = replacer or self._replacer - self.normalizer = normalizer or self._replacer - - def __call__(self, data, lower=True): - if lower: - data = data.lower() - for sentence in self.sentence_tokenizer(data): - toks = self.word_tokenizer(sentence) - new_sentence = [] - - for tok in toks: - if tok: - out = self.replacer(tok) - self.tokens.append((len(self.tokens), tok, out)) - new_sentence.append(out) - if new_sentence: - yield new_sentence + + def replacer(self, token: str) -> str: + """ This function allows for changing input and keeping it in memory """ + return token + + def __init__(self): + self.tokens: List[Tuple[int, int, str]] = [] + + def _real_word_tokenizer(self, data: str, lower: bool = False) -> List[str]: + return super(MemorizingTokenizer, self).word_tokenizer(data, lower=lower) + + def word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + sentence = [] + for token in self._real_word_tokenizer(text, lower): + out = self.replacer(token) + self.tokens.append((len(self.tokens), token, out)) + sentence.append(out) + return sentence + + def reset(self): # Empty + self.tokens = [] + diff --git a/pie_extended/pipeline/tokenizers/simple_tokenizer.py b/pie_extended/pipeline/tokenizers/simple_tokenizer.py new file mode 100644 index 0000000..ef633a9 --- /dev/null +++ b/pie_extended/pipeline/tokenizers/simple_tokenizer.py @@ -0,0 +1,33 @@ +from typing import Generator, List +import regex as re +import string +from pie.tagger import regexsplitter, SECTION, FULLSTOP + +WORD = r'([{}])'.format(string.punctuation) + + +class SimpleTokenizer(object): + """ Tokenizer that memoryze what it tokenized. + + Mostly used to normalized input as input time and then reinserting normalized input + + """ + def __init__(self): + self.section = regexsplitter(SECTION) + self.fullstop = regexsplitter(FULLSTOP) + self.word = regexsplitter(WORD) + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: + for line in self.section(text): + for sentence in self.fullstop(line): + yield self.word_tokenizer(sentence, lower=lower) + + def word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + sentence = [w for raw in text.split() for w in self.word(raw)] + if lower: + sentence = [w.lower() for w in sentence] + return sentence + + def reset(self): + """Can be used between documents for example """ + pass diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 46741e9..413a3cf 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -47,7 +47,10 @@ def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototy def iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) \ -> Generator[Dict[str, str], None, None]: + # Reset at each document processor.reset() + iterator.tokenizer.reset() + # Iterate ! 
for chunk in utils.chunks( iterator(data, lower=self.lower), size=self.batch_size): @@ -61,7 +64,8 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor sents=[sent for sent in sents if sent], lengths=lengths ) - processor.set_tasks(tasks) + if not processor.tasks: + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): @@ -84,7 +88,6 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 - yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): @@ -97,7 +100,7 @@ def iter_tag(self, data: str, iterator: DataIterator, processor: type): if not formatter: formatter = Formatter(list(annotation.keys())) yield formatter.write_headers() - yield formatter.write_line(formatter) + yield formatter.write_line(annotation) if formatter: yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 5527508..fa37827 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -1,4 +1,4 @@ -from pie_extended.models.lasla.classes import get_iterator_and_formatter +from pie_extended.models.lasla.classes import get_iterator_and_processor from pie_extended.testing_utils import FakeTagger from typing import List, Tuple @@ -21,8 +21,8 @@ def make_controller(sentences: List[str]): make_fake_data(sentences), tasks="lemma,Voice,Mood,Deg,Numb,Person,Tense,Case,Gend,pos".split(",") ) - iterator, formatter = get_iterator_and_formatter() - return tagger, iterator, formatter + iterator, processor = get_iterator_and_processor() + return tagger, iterator, processor class TestPonctuation(TestCase): @@ -32,7 +32,7 @@ def test_consecutive_dots(self): Found out the hard way it would break things """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum", "causam turbationis hanc docuit quod pater" ]) @@ -40,7 +40,7 @@ def test_consecutive_dots(self): result = tagger.tag_str( data="id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum ." " . causam turbationis hanc docuit quod pater", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) self.assertIn( @@ -57,12 +57,12 @@ def test_leading_punctuation(self): Special case of consecutive dots, where sentences starts with it """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) self.assertIn( @@ -82,12 +82,12 @@ def test_punctuation_is_not_seen(self): """Check that punctuation is not seen by the tagger """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . 
.", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) self.assertNotIn( @@ -100,12 +100,12 @@ def test_j_are_temporarly_replaced(self): """Check that characters are replaced for the tagger, thus avoiding out of domain, and reinserted """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et judicis uiduarum . .", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) @@ -116,12 +116,12 @@ def test_j_are_temporarly_replaced(self): def test_underscores(self): string = "una operatio in ecclesiae fundamento.._... _ . laetatur autem pater quia filius perierat" - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "una operatio in ecclesiae fundamento", "laetatur autem pater quia filius perierat" ]) tagger.tag_str( string, - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) From 3e519ca8880bc6b03ad0171576571a616850afe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 19:47:54 +0100 Subject: [PATCH 04/14] Definitely bugging but not crashing though ! --- pie_extended/models/fro/__init__.py | 2 +- pie_extended/models/fro/get.py | 20 ++++++ pie_extended/models/fro/processor.py | 41 ++++++++++++ .../models/fro/{classes.py => tokenizer.py} | 62 +------------------ pie_extended/models/lasla/classes.py | 2 +- 5 files changed, 66 insertions(+), 61 deletions(-) create mode 100644 pie_extended/models/fro/get.py create mode 100644 pie_extended/models/fro/processor.py rename pie_extended/models/fro/{classes.py => tokenizer.py} (60%) diff --git a/pie_extended/models/fro/__init__.py b/pie_extended/models/fro/__init__.py index 63da07b..86506e2 100644 --- a/pie_extended/models/fro/__init__.py +++ b/pie_extended/models/fro/__init__.py @@ -1,5 +1,5 @@ from ...utils import Metadata, File, get_path -from .classes import get_iterator_and_processor +from .get import get_iterator_and_processor from ...pipeline.iterators.proto import DataIterator DESC = Metadata( diff --git a/pie_extended/models/fro/get.py b/pie_extended/models/fro/get.py new file mode 100644 index 0000000..04154d8 --- /dev/null +++ b/pie_extended/models/fro/get.py @@ -0,0 +1,20 @@ +from .processor import FroRulesProcessor, FroGlueProcessor +from .tokenizer import FroMemorizingTokenizer +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor + + +def get_iterator_and_processor(): + tokenizer = FroMemorizingTokenizer() + processor = FroRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=FroGlueProcessor() + ) + ) + iterator = DataIterator( + tokenizer=tokenizer, + remove_from_input=DataIterator.remove_punctuation + ) + return iterator, processor + diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py new file mode 100644 index 0000000..5c54e7a --- /dev/null +++ b/pie_extended/models/fro/processor.py @@ -0,0 +1,41 @@ +import regex as re +from typing import Dict + +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from 
pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor + + +class FroRulesProcessor(RuleBasedProcessor): + """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically + + """ + PONCTU = re.compile(r"^\W+$") + NUMBER = re.compile(r"\d+") + PONFORT = [".", "...", "!", "?"] + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + token = annotation["form"] + if self.PONCTU.match(token): + if token in self.PONFORT: + pos = "PONfrt" + else: + pos = "PONfbl" + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} + elif self.NUMBER.match(token): + annotation["pos"] = "ADJcar" + return annotation + + def __init__(self, *args, **kwargs): + super(FroRulesProcessor, self).__init__(*args, **kwargs) + + +class FroGlueProcessor(GlueProcessor): + """ We glue morphological features into one column + + """ + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} + MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + + def __init__(self, *args, **kwargs): + super(FroGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/tokenizer.py similarity index 60% rename from pie_extended/models/fro/classes.py rename to pie_extended/models/fro/tokenizer.py index 482b696..fb1c3e9 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/tokenizer.py @@ -1,11 +1,7 @@ import regex as re -from typing import List, Dict, Generator -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor -from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor -from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from typing import List, Generator +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer _Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“" _Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]" @@ -76,56 +72,4 @@ def normalizer(self, data: str) -> str: ) ) ) - return data - - -class FroRulesProcessor(RuleBasedProcessor): - """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically - - """ - PONCTU = re.compile(r"^\W+$") - NUMBER = re.compile(r"\d+") - PONFORT = [".", "...", "!", "?"] - - def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: - token = annotation["form"] - if self.PONCTU.match(token): - if token in self.PONFORT: - pos = "PONfrt" - else: - pos = "PONfbl" - return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} - elif self.NUMBER.match(token): - annotation["pos"] = "ADJcar" - return annotation - - def __init__(self, *args, **kwargs): - super(FroRulesProcessor, self).__init__(*args, **kwargs) - - -class FroGlueProcessor(GlueProcessor): - """ We glue morphological features into one column - - """ - OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] - GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} - MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} - - def __init__(self, *args, **kwargs): - super(FroGlueProcessor, self).__init__(*args, **kwargs) - - -def get_iterator_and_processor(): - tokenizer = FroMemorizingTokenizer() - processor = FroRulesProcessor( - MemoryzingProcessor( - tokenizer_memory=tokenizer, - 
head_processor=FroGlueProcessor() - ) - ) - iterator = DataIterator( - tokenizer=tokenizer, - remove_from_input=DataIterator.remove_punctuation - ) - return iterator, processor - + return data \ No newline at end of file diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py index fd0ad69..001ed89 100644 --- a/pie_extended/models/lasla/classes.py +++ b/pie_extended/models/lasla/classes.py @@ -7,7 +7,7 @@ from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor from pie_extended.pipeline.postprocessor.glue import GlueProcessor from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer -from pie_extended.models.fro.classes import _RomanNumber, _Dots_except_apostrophe, _Dots_collections +from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _Dots_collections, _RomanNumber try: import cltk From 9914265c8b0c4e172b2dc5c93ab3cbf78be97332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 19:49:57 +0100 Subject: [PATCH 05/14] (Refactored latin as well) --- pie_extended/models/lasla/__init__.py | 2 +- pie_extended/models/lasla/get.py | 24 +++++++ pie_extended/models/lasla/processor.py | 39 +++++++++++ .../models/lasla/{classes.py => tokenizer.py} | 66 ++----------------- 4 files changed, 69 insertions(+), 62 deletions(-) create mode 100644 pie_extended/models/lasla/get.py create mode 100644 pie_extended/models/lasla/processor.py rename pie_extended/models/lasla/{classes.py => tokenizer.py} (63%) diff --git a/pie_extended/models/lasla/__init__.py b/pie_extended/models/lasla/__init__.py index 7074413..512e9f5 100644 --- a/pie_extended/models/lasla/__init__.py +++ b/pie_extended/models/lasla/__init__.py @@ -1,2 +1,2 @@ from pie_extended.models.lasla.consts import DOWNLOADS, Models, Disambiguator, addons, DESC -from pie_extended.models.lasla.classes import get_iterator_and_processor +from pie_extended.models.lasla.get import get_iterator_and_processor diff --git a/pie_extended/models/lasla/get.py b/pie_extended/models/lasla/get.py new file mode 100644 index 0000000..1b83700 --- /dev/null +++ b/pie_extended/models/lasla/get.py @@ -0,0 +1,24 @@ +import regex as re + +from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor +from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor + +# Uppercase regexp +uppercase = re.compile(r"^[A-Z]$") + + +def get_iterator_and_processor(): + tokenizer = LatMemorizingTokenizer() + processor = LatinRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=LatinGlueProcessor() + ) + ) + iterator = DataIterator( + tokenizer=tokenizer, + remove_from_input=DataIterator.remove_punctuation + ) + return iterator, processor diff --git a/pie_extended/models/lasla/processor.py b/pie_extended/models/lasla/processor.py new file mode 100644 index 0000000..c4293b0 --- /dev/null +++ b/pie_extended/models/lasla/processor.py @@ -0,0 +1,39 @@ +import regex as re +from typing import Dict + +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor + + +class LatinRulesProcessor(RuleBasedProcessor): + """ Lasla data has no punctuation, we tag it automatically. + + "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) 
+ + """ + PONCTU = re.compile(r"^\W+$") + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + # If Else condition + token = annotation["form"] + if self.PONCTU.match(token): + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} + elif token.startswith("-"): + if token == "-ne": + annotation["lemma"] = "ne2" + else: + annotation["lemma"] = "ne" + return annotation + + def __init__(self, *args, **kwargs): + super(LatinRulesProcessor, self).__init__(*args, **kwargs) + + +class LatinGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + WHEN_EMPTY = {"morph": "MORPH=empty"} + MAP = {"pos": "POS"} + + def __init__(self, *args, **kwargs): + super(LatinGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/tokenizer.py similarity index 63% rename from pie_extended/models/lasla/classes.py rename to pie_extended/models/lasla/tokenizer.py index 001ed89..7d1f6b6 100644 --- a/pie_extended/models/lasla/classes.py +++ b/pie_extended/models/lasla/tokenizer.py @@ -1,13 +1,10 @@ -from typing import Dict, List, Generator -import sys import regex as re import click -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor -from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor -from pie_extended.pipeline.postprocessor.glue import GlueProcessor +import sys +from typing import List, Generator + +from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _RomanNumber from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer -from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _Dots_collections, _RomanNumber try: import cltk @@ -19,44 +16,6 @@ sys.exit(0) -class LatinRulesProcessor(RuleBasedProcessor): - """ Lasla data has no punctuation, we tag it automatically. - - "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) - - """ - PONCTU = re.compile(r"^\W+$") - - def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: - # If Else condition - token = annotation["form"] - if self.PONCTU.match(token): - return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} - elif token.startswith("-"): - if token == "-ne": - annotation["lemma"] = "ne2" - else: - annotation["lemma"] = "ne" - return annotation - - def __init__(self, *args, **kwargs): - super(LatinRulesProcessor, self).__init__(*args, **kwargs) - - -class LatinGlueProcessor(GlueProcessor): - OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] - GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} - WHEN_EMPTY = {"morph": "MORPH=empty"} - MAP = {"pos": "POS"} - - def __init__(self, *args, **kwargs): - super(LatinGlueProcessor, self).__init__(*args, **kwargs) - - -# Uppercase regexp -uppercase = re.compile(r"^[A-Z]$") - - class LatMemorizingTokenizer(MemorizingTokenizer): re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( @@ -129,19 +88,4 @@ def replacer(self, inp: str): # # On the other hand, it creates empty tokens... 
# data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) # data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - # return data - - -def get_iterator_and_processor(): - tokenizer = LatMemorizingTokenizer() - processor = LatinRulesProcessor( - MemoryzingProcessor( - tokenizer_memory=tokenizer, - head_processor=LatinGlueProcessor() - ) - ) - iterator = DataIterator( - tokenizer=tokenizer, - remove_from_input=DataIterator.remove_punctuation - ) - return iterator, processor + # return data \ No newline at end of file From dce914870a0c53e53a5c18a5357b2b0bfd987467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 12:22:58 +0100 Subject: [PATCH 06/14] (Architecture changes) Fixed issues with rules not applied on reinsert. Fixed writing that was writing keys instead of values --- pie_extended/cli/__init__.py | 9 ++++++++- pie_extended/models/fro/get.py | 3 ++- pie_extended/models/fro/processor.py | 4 +++- pie_extended/models/lasla/get.py | 3 ++- pie_extended/models/lasla/processor.py | 2 +- pie_extended/pipeline/postprocessor/glue.py | 18 +++++++++++++----- .../pipeline/postprocessor/rulebased.py | 13 ++++++++++++- pie_extended/tagger.py | 14 +------------- tests/test_models/test_lasla.py | 2 +- 9 files changed, 43 insertions(+), 25 deletions(-) diff --git a/pie_extended/cli/__init__.py b/pie_extended/cli/__init__.py index 0742dbe..b4d727c 100644 --- a/pie_extended/cli/__init__.py +++ b/pie_extended/cli/__init__.py @@ -58,7 +58,14 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path) """ Tag as many [filepath] as you want with [model] """ from tqdm import tqdm click.echo(click.style("Getting the tagger", bold=True)) - tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path) + try: + tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path) + except FileNotFoundError as e: + click.echo("Model not found: please make sure you have downloaded the model files with " + "pie-extended download " + model) + if debug: + raise e + return failures = [] for file in tqdm(filepath): try: diff --git a/pie_extended/models/fro/get.py b/pie_extended/models/fro/get.py index 04154d8..4bd2a43 100644 --- a/pie_extended/models/fro/get.py +++ b/pie_extended/models/fro/get.py @@ -7,7 +7,8 @@ def get_iterator_and_processor(): tokenizer = FroMemorizingTokenizer() processor = FroRulesProcessor( - MemoryzingProcessor( + apply_on_reinsert=True, + head_processor=MemoryzingProcessor( tokenizer_memory=tokenizer, head_processor=FroGlueProcessor() ) diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py index 5c54e7a..d17e51f 100644 --- a/pie_extended/models/fro/processor.py +++ b/pie_extended/models/fro/processor.py @@ -20,7 +20,7 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: pos = "PONfrt" else: pos = "PONfbl" - return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty", "treated": token} elif self.NUMBER.match(token): annotation["pos"] = "ADJcar" return annotation @@ -36,6 +36,8 @@ class FroGlueProcessor(GlueProcessor): OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + EMPTY_TAG: Dict[str, str] = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_", "TEMPS": "_", "GENRE": "_", + "PERS.": "_"} 
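The `GLUE` and `EMPTY_TAG` attributes above feed the reworked gluing logic that this commit adds to `pie_extended/pipeline/postprocessor/glue.py` a few hunks below: values equal to a task's empty tag are dropped, the remaining ones are written as `TASK=value` pairs (the "keys instead of values" bug mentioned in the commit message), and a fallback is used when nothing is left. A rough standalone sketch of that rule, in plain Python mirroring the class attributes shown here with made-up tag values (not the library class itself):

```python
GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]}
EMPTY_TAG = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_",
             "TEMPS": "_", "GENRE": "_", "PERS.": "_"}
GLUE_EMPTY = {"morph": "MORPH=empty"}


def glue(token_dict, head="morph", glue_char="|"):
    """Join task values as TASK=value pairs, skipping values marked as empty."""
    joined = glue_char.join(
        task + "=" + token_dict[task]
        for task in GLUE[head]
        if token_dict[task] != EMPTY_TAG.get(task)
    )
    return joined or GLUE_EMPTY[head]


# Non-empty values are kept and written as key=value, empty ones ("_") are skipped:
print(glue({"MODE": "ind", "TEMPS": "pst", "PERS.": "3", "NOMB.": "s",
            "GENRE": "_", "CAS": "_", "DEGRE": "_"}))
# MODE=ind|TEMPS=pst|PERS.=3|NOMB.=s

# When every glued value is empty, the fallback applies:
print(glue(dict.fromkeys(GLUE["morph"], "_")))
# MORPH=empty
```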
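The other fix in this commit, `apply_on_reinsert`, makes hand-written rules run on tokens that were excluded from tagging and later reinserted, not only on tagged tokens. A minimal sketch of the intended effect, assuming the processor API shown in this patch series; the `PunctuationRules` class, its single rule, the task names and the `_` empty value are hypothetical:

```python
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor


class PunctuationRules(RuleBasedProcessor):
    """Hypothetical rule set: force the POS of a reinserted full stop."""
    def rules(self, annotation):
        if annotation["form"] == ".":
            annotation["pos"] = "PUNC"
        return annotation


processor = PunctuationRules(apply_on_reinsert=True,
                             head_processor=ProcessorPrototype(empty_value="_"))
processor.set_tasks(["lemma", "pos"])

# Reinserted tokens never go through the tagger, but the rules now still apply:
print(processor.reinsert("."))
# expected: {'form': '.', 'lemma': '_', 'pos': 'PUNC'}
```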
def __init__(self, *args, **kwargs): super(FroGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/lasla/get.py b/pie_extended/models/lasla/get.py index 1b83700..3c6e582 100644 --- a/pie_extended/models/lasla/get.py +++ b/pie_extended/models/lasla/get.py @@ -12,7 +12,8 @@ def get_iterator_and_processor(): tokenizer = LatMemorizingTokenizer() processor = LatinRulesProcessor( - MemoryzingProcessor( + apply_on_reinsert=True, + head_processor=MemoryzingProcessor( tokenizer_memory=tokenizer, head_processor=LatinGlueProcessor() ) diff --git a/pie_extended/models/lasla/processor.py b/pie_extended/models/lasla/processor.py index c4293b0..8af49bd 100644 --- a/pie_extended/models/lasla/processor.py +++ b/pie_extended/models/lasla/processor.py @@ -17,7 +17,7 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: # If Else condition token = annotation["form"] if self.PONCTU.match(token): - return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty", "treated": token} elif token.startswith("-"): if token == "-ne": annotation["lemma"] = "ne2" diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 0749394..6a54ed4 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -15,15 +15,19 @@ class GlueProcessor(RenamedTaskProcessor): GLUE_CHAR: str = "|" # Glue Empty are value to take when all things glued together are empty GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} + # Value that means the current element is empty + EMPTY_TAG: Dict[str, str] = {"Case": "_", "Numb": "_", "Deg": "_", "Mood": "_", "Tense": "_", "Voice": "_", + "Person": "_"} def __init__(self, *args, **kwargs): super(GlueProcessor, self).__init__(*args, **kwargs) # Sets-up some copy of the values - self._out = type(self).OUTPUT_KEYS - self._glue = type(self).GLUE - self._glue_char = type(self).GLUE_CHAR - self._glue_empty = type(self).GLUE_EMPTY + self._out = self.OUTPUT_KEYS + self._glue = self.GLUE + self._glue_char = self.GLUE_CHAR + self._glue_empty = self.GLUE_EMPTY + self._empty_tags = self.EMPTY_TAG def set_tasks(self, tasks): super(GlueProcessor, self).set_tasks(tasks) @@ -38,7 +42,11 @@ def _yield_annotation( yield head, token_dict[head] else: # Otherwise, we glue together things that should be glued together - joined = self._glue_char.join([token_dict[glued_task] for glued_task in self._glue[head]]) + joined = self._glue_char.join([ + glued_task + "=" + token_dict[glued_task] + for glued_task in self._glue[head] + if token_dict[glued_task] != self._empty_tags.get(glued_task, -1) + ]) if not joined: joined = self._glue_empty[head] yield head, joined diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py index 279f97e..d2d5f1b 100644 --- a/pie_extended/pipeline/postprocessor/rulebased.py +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -10,12 +10,23 @@ class RuleBasedProcessor(ChainedProcessor): """ KEY: str = "treated" - def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs): + """ Apply rules on output of the taggers + + :param apply_on_reinsert: Apply rules on reinsert task + """ super(RuleBasedProcessor, self).__init__(head_processor=head_processor, 
**kwargs) self._key: str = type(self).KEY + self.apply_on_reinsert= apply_on_reinsert def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: return annotation + def reinsert(self, form: str) -> Dict[str, str]: + anno = super(RuleBasedProcessor, self).reinsert(form) + if self.apply_on_reinsert: + return self.rules(anno) + return anno + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 413a3cf..90d9ab6 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -19,18 +19,6 @@ def __init__(self, device='cpu', batch_size=100, lower=False, disambiguation=Non ) self.disambiguation: Optional[Disambiguator] = disambiguation - def reinsert_full(self, formatter, sent_reinsertion, tasks): - yield formatter.write_sentence_beginning() - # If a sentence is empty, it's most likely because everything is in sent_reinsertions - for reinsertion in sorted(list(sent_reinsertion.keys())): - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion], - tags=[""] * len(tasks) - ) - ) - yield formatter.write_sentence_end() - def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorPrototype): # Read content of the file with open(fpath) as f: @@ -100,7 +88,7 @@ def iter_tag(self, data: str, iterator: DataIterator, processor: type): if not formatter: formatter = Formatter(list(annotation.keys())) yield formatter.write_headers() - yield formatter.write_line(annotation) + yield formatter.write_line(formatter.format_line(annotation)) if formatter: yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index fa37827..f9a1d09 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -1,4 +1,4 @@ -from pie_extended.models.lasla.classes import get_iterator_and_processor +from pie_extended.models.lasla.get import get_iterator_and_processor from pie_extended.testing_utils import FakeTagger from typing import List, Tuple From e5e68bb52ff68292176d1fe3c7401438ff1ec900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 13:06:59 +0100 Subject: [PATCH 07/14] (Fixing tags) --- pie_extended/models/fro/tokenizer.py | 7 +++-- pie_extended/models/lasla/tokenizer.py | 25 ++------------- pie_extended/pipeline/postprocessor/memory.py | 2 +- .../pipeline/tokenizers/memorizing.py | 2 +- pie_extended/tagger.py | 5 +-- tests/test_models/test_lasla.py | 31 ++++++++----------- 6 files changed, 25 insertions(+), 47 deletions(-) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index fb1c3e9..bd7c80f 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -14,7 +14,7 @@ class FroMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter @@ -72,4 +72,7 @@ def normalizer(self, data: str) -> str: ) ) ) - return data \ No newline at end of file + return data + + def replacer(self, inp: str): + return self.re_remove_ending_apostrophe.sub("", inp) \ No 
newline at end of file diff --git a/pie_extended/models/lasla/tokenizer.py b/pie_extended/models/lasla/tokenizer.py index 7d1f6b6..68b55ae 100644 --- a/pie_extended/models/lasla/tokenizer.py +++ b/pie_extended/models/lasla/tokenizer.py @@ -17,14 +17,7 @@ class LatMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") - re_add_space_around_apostrophe_that_are_quotes = re.compile( - r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" - # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter - # ?'. or manger'_ or _'Bonjour - ) - re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") - re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") _sentence_boundaries = re.compile( r"([" + _Dots_except_apostrophe + r"]+\s*)+" ) @@ -63,29 +56,15 @@ def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[s yield from sentences def normalizer(self, data: str) -> str: - data = self.re_remove_ending_apostrophe.sub( - r"\g<1> ", - self.re_add_space_around_apostrophe_that_are_quotes.sub( - r" \g<2> ", - self.re_add_space_around_punct.sub( + data = self.re_add_space_around_punct.sub( r" \g<2> ", self.roman_number_dot.sub( r"_DOT_\g<1>_DOT_", data ) ) - ) - ) return data def replacer(self, inp: str): inp = inp.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i") return inp - - #def normalizer(self, data: str): - # # Fix regarding the current issue of apostrophe - # # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 - # # On the other hand, it creates empty tokens... - # data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) - # data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - # return data \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index b86183e..0ab69ae 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -30,4 +30,4 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: def reinsert(self, form: str) -> Dict[str, str]: self.memory.tokens.pop(0) - return super(MemoryzingProcessor, self).reinsert(form) \ No newline at end of file + return super(MemoryzingProcessor, self).reinsert(form) diff --git a/pie_extended/pipeline/tokenizers/memorizing.py b/pie_extended/pipeline/tokenizers/memorizing.py index 13cd39e..7940e80 100644 --- a/pie_extended/pipeline/tokenizers/memorizing.py +++ b/pie_extended/pipeline/tokenizers/memorizing.py @@ -14,7 +14,7 @@ def replacer(self, token: str) -> str: return token def __init__(self): - self.tokens: List[Tuple[int, int, str]] = [] + self.tokens: List[Tuple[int, str, str]] = [] def _real_word_tokenizer(self, data: str, lower: bool = False) -> List[str]: return super(MemorizingTokenizer, self).word_tokenizer(data, lower=lower) diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 90d9ab6..604e401 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -46,8 +46,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor # to be reinserted sents, lengths, needs_reinsertion = zip(*chunk) - is_empty = [0 == len(sent) for sent in enumerate(sents)] - + is_empty = [not bool(sent) for sent in sents] tagged, tasks = self.tag( sents=[sent for sent in sents if sent], lengths=lengths @@ -72,10 +71,12 
@@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor reinsertion_index = 0 for index, (token, tags) in enumerate(sent): + # Before current index while reinsertion_index + index in sent_reinsertion: yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 + yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index f9a1d09..156bde5 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -43,12 +43,11 @@ def test_consecutive_dots(self): processor=processor, iterator=data_iterator ) - self.assertIn( - "uiduarum uiduarum fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake" - " uiduarum\r\n" - ". . PUNC MORPH=empty .\r\n" - ". . PUNC MORPH=empty .", - result, + self.assertEqual( + result[12], + {"form": "uiduarum", "lemma": "uiduarum", "POS": "fake", "morph": "Case=fake|Numb=fake|Deg=fake|Mood=fake|" + "Tense=fake|Voice=fake|Person=fake", + "treated": "uiduarum"}, "Punctuation should be reinserted and mostly should not break anything" ) @@ -58,23 +57,19 @@ def test_leading_punctuation(self): Special case of consecutive dots, where sentences starts with it """ tagger, data_iterator, processor = make_controller([ + # Need an empty sentence because ( was treated as such "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( - "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", + "( id enim ait) turbabuntur a facie eius patris or phanorum et iudicis uiduarum ..", processor=processor, iterator=data_iterator ) - self.assertIn( - "form lemma POS morph treated_token\r\n" - "( ( PUNC MORPH=empty (\r\n" - "id id fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake id\r\n" - "enim enim fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake enim\r\n" - "ait ait fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake ait\r\n" - ") ) PUNC MORPH=empty )\r\n" - "turbabuntur turbabuntur fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person" - "=fake turbabuntur\r\n", - result, + tokens = [t["form"] for t in result] + self.assertEqual( + ["(", "id", "enim", "ait", ")", "turbabuntur", "a", "facie", "eius", "patris", "or", "phanorum", + "et", "iudicis", "uiduarum", ".", "."], + tokens, "Leading punctuation should not break anything" ) @@ -130,4 +125,4 @@ def test_underscores(self): 'perierat'], flatten_seen, "Seen element should not count the underscord" - ) \ No newline at end of file + ) From ce97f7474612667473a4393b18f32febbfeb85a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 14:38:59 +0100 Subject: [PATCH 08/14] Fixed tests for Lasla --- tests/test_models/test_lasla.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 156bde5..c86cd27 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -96,18 +96,17 @@ def test_j_are_temporarly_replaced(self): """ tagger, data_iterator, processor = make_controller([ - "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" + "iudicis uiduarum" ]) result = tagger.tag_str( - "( id enim ait ) turbabuntur a facie eius 
patris or phanorum et judicis uiduarum . .", + "judicis uiduarum", processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) - self.assertNotIn("judicis", flatten_seen, "'j' should be removed from tagging") - self.assertIn("iudicis", flatten_seen, "And 'i' should replace it") - self.assertIn("\njudicis\t", result, "But, in the end, the original form is given to the user") + self.assertEqual(result[0]["form"], "judicis", "'j' should be removed from tagging") + self.assertEqual(result[0]["treated"], "iudicis", "And 'i' should replace it") def test_underscores(self): string = "una operatio in ecclesiae fundamento.._... _ . laetatur autem pater quia filius perierat" From 0943043b20d252bbb60b54bb0ebcf9634724fbf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:03:01 +0100 Subject: [PATCH 09/14] Remaining bugs but overall not bad --- pie_extended/models/fro/processor.py | 2 +- pie_extended/models/fro/tokenizer.py | 11 ++++++--- tests/test_models/test_fro.py | 37 ++++++++++++++++++++++++++++ tests/test_models/test_lasla.py | 2 +- 4 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 tests/test_models/test_fro.py diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py index d17e51f..21e64ba 100644 --- a/pie_extended/models/fro/processor.py +++ b/pie_extended/models/fro/processor.py @@ -40,4 +40,4 @@ class FroGlueProcessor(GlueProcessor): "PERS.": "_"} def __init__(self, *args, **kwargs): - super(FroGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file + super(FroGlueProcessor, self).__init__(*args, **kwargs) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index bd7c80f..3993587 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -16,8 +16,13 @@ class FroMemorizingTokenizer(MemorizingTokenizer): re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( - r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" - # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + r"(" + r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|" + r"((?<=[\w])[\'’ʼ]+(?=[\W]))|" + r"((?<=[\W])[\'’ʼ]+(?=[\w])))|" + r"(^[\'’ʼ]+)|" + r"([\'’ʼ]+$))" + # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + Starting or ending apostrophe # ?'. 
or manger'_ or _'Bonjour ) re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") @@ -75,4 +80,4 @@ def normalizer(self, data: str) -> str: return data def replacer(self, inp: str): - return self.re_remove_ending_apostrophe.sub("", inp) \ No newline at end of file + return self.re_remove_ending_apostrophe.sub("", inp) diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py new file mode 100644 index 0000000..accce3d --- /dev/null +++ b/tests/test_models/test_fro.py @@ -0,0 +1,37 @@ +from pie_extended.models.fro.get import get_iterator_and_processor +from pie_extended.testing_utils import FakeTagger +from typing import List, Tuple + +from unittest import TestCase +from .test_lasla import make_fake_data + + +def make_controller(sentences: List[str]): + # Add the lemmatizer routes + tagger = FakeTagger( + make_fake_data(sentences), + tasks="lemma,MODE,TEMPS,PERS,NOMB,GENRE,CAS,DEGRE,POS".split(",") + ) + iterator, processor = get_iterator_and_processor() + return tagger, iterator, processor + + +class TestFro(TestCase): + def test_elision_apostrophe(self): + string = "q'il meurt" + treated = ["q il meurt"] + tagger, it, pro = make_controller(treated) + out = tagger.tag_str(string, it, pro) + self.assertEqual(out[0]["form"], "q'") + self.assertEqual(out[0]["treated"], "q") + + def test_elision_apostrophe_and_quote(self): + string = "a q'il meurt 'dit il'" + treated = ["a q il meurt dit il"] + tagger, it, pro = make_controller(treated) + out = tagger.tag_str(string, it, pro) + self.assertEqual(out[0]["form"], "a") + self.assertEqual(out[0]["treated"], "a") + self.assertEqual(out[1]["form"], "q'") + self.assertEqual(out[1]["treated"], "q") + # Ending and starting apostrophe are not reinserted for some reason. diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index c86cd27..1359afc 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -25,7 +25,7 @@ def make_controller(sentences: List[str]): return tagger, iterator, processor -class TestPonctuation(TestCase): +class TestLasla(TestCase): def test_consecutive_dots(self): """Check that consecutive punctation does not break anything From 1166fcf74847675e88c20361e4903c4ee7233c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:52:29 +0100 Subject: [PATCH 10/14] Fro is tested --- pie_extended/models/fro/tokenizer.py | 31 +++++++++++++++------------- tests/test_models/test_fro.py | 9 ++++---- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index 3993587..bac16b6 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -14,7 +14,9 @@ class FroMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") + APOSTROPHES = "'’ʼ" + re_elision_apostrophe = re.compile(r"(\w+)([" + APOSTROPHES + r"])(\w+)") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"(" r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|" @@ -42,10 +44,11 @@ def _sentence_tokenizer_merge_matches(match): start, end = match.span() return match.string[start:end] + "" - @classmethod - def _real_sentence_tokenizer(cls, string: str) -> List[str]: - string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) + def _real_sentence_tokenizer(self, string: str) -> List[str]: + string = 
self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string) string = string.replace("_DOT_", ".") + for index_apo, apo in enumerate(self.APOSTROPHES): + string = string.replace("ApOsTrOpHe"+str(index_apo), apo+" ") return string.split("") def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: @@ -63,19 +66,19 @@ def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[s sentences.append(self.word_tokenizer(sent)) yield from sentences + def apostrophe_replace(self, regex_match) -> str: + return regex_match.group(1) + "ApOsTrOpHe"+ str(self.APOSTROPHES.index(regex_match.group(2))) + regex_match.group(3) + def normalizer(self, data: str) -> str: - data = self.re_remove_ending_apostrophe.sub( - r"\g<1> ", - self.re_add_space_around_apostrophe_that_are_quotes.sub( - r" \g<2> ", - self.re_add_space_around_punct.sub( + data = self.re_add_space_around_punct.sub( r" \g<2> ", - self.roman_number_dot.sub( - r"_DOT_\g<1>_DOT_", - data + self.re_elision_apostrophe.sub( + self.apostrophe_replace, + self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) ) - ) - ) ) return data diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py index accce3d..9f80f6d 100644 --- a/tests/test_models/test_fro.py +++ b/tests/test_models/test_fro.py @@ -26,12 +26,13 @@ def test_elision_apostrophe(self): self.assertEqual(out[0]["treated"], "q") def test_elision_apostrophe_and_quote(self): - string = "a q'il meurt 'dit il'" - treated = ["a q il meurt dit il"] + string = "'q'il meurt 'dit il'" + treated = ["q il meurt dit il"] tagger, it, pro = make_controller(treated) out = tagger.tag_str(string, it, pro) - self.assertEqual(out[0]["form"], "a") - self.assertEqual(out[0]["treated"], "a") + self.assertEqual(out[0]["form"], "'") + self.assertEqual(out[0]["treated"], "'") self.assertEqual(out[1]["form"], "q'") self.assertEqual(out[1]["treated"], "q") + self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept") # Ending and starting apostrophe are not reinserted for some reason. From 01c9dd4940aa5c0bfd6a68200224324654a8d19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:57:49 +0100 Subject: [PATCH 11/14] Updated error for pie-extended install lasla & added a test for roman number in Fro --- pie_extended/models/lasla/tokenizer.py | 2 +- tests/test_models/test_fro.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pie_extended/models/lasla/tokenizer.py b/pie_extended/models/lasla/tokenizer.py index 68b55ae..14f6d29 100644 --- a/pie_extended/models/lasla/tokenizer.py +++ b/pie_extended/models/lasla/tokenizer.py @@ -12,7 +12,7 @@ except ImportError as E: click.echo(click.style("You need to install cltk and its Latin Data to runs this package", fg="red")) click.echo("pip install cltk") - click.echo("pie-ext install-addons lasla") + click.echo("pie-extended install-addons lasla") sys.exit(0) diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py index 9f80f6d..16d42ea 100644 --- a/tests/test_models/test_fro.py +++ b/tests/test_models/test_fro.py @@ -36,3 +36,14 @@ def test_elision_apostrophe_and_quote(self): self.assertEqual(out[1]["treated"], "q") self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept") # Ending and starting apostrophe are not reinserted for some reason. + + def test_tokenization_roman_number(self): + iterator, _ = get_iterator_and_processor() + self.assertEqual( + list(iterator.tokenizer.sentence_tokenizer("Les .XIII. 
tables du Duc du XII.. C'est fantastique")), + [ + ["Les", ".XIII.", "tables", "du", "Duc", "du", "XII", ".", "."], + ["C", 'est', "fantastique"] + ], + "Dots around roman number are not sentences markers" + ) \ No newline at end of file From 50d81e576ded28e1a112515697e78623c23b3510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 18:20:21 +0100 Subject: [PATCH 12/14] Added a lot of doctests --- .travis.yml | 2 +- pie_extended/pipeline/postprocessor/glue.py | 19 ++++- pie_extended/pipeline/postprocessor/memory.py | 26 +++++- pie_extended/pipeline/postprocessor/proto.py | 81 +++++++++++++++++-- .../pipeline/postprocessor/rulebased.py | 19 ++++- 5 files changed, 134 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index cc9cae4..7f428a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: # command to run tests script: - pie-extended install-addons lasla - - nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture + - nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest after_success: - coverage combine - coveralls \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 6a54ed4..76cbf66 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -1,10 +1,27 @@ -from .proto import ProcessorPrototype, RenamedTaskProcessor +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, RenamedTaskProcessor from typing import Generator, Dict, List class GlueProcessor(RenamedTaskProcessor): """ Glues together specific tasks + >>> class SimpleGlue(GlueProcessor): + ... OUTPUT_KEYS = ["form", "lemma", "task3"] + ... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3` + ... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag + ... GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value + >>> x = SimpleGlue() + >>> x.set_tasks(["lemma", "1", "2"]) + >>> # Merges b and c values from task 1 and 2 into a new task + >>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"} + True + >>> # Keeps only one task because 2 is empty + >>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"} + True + >>> # Fills with the default empty tag because both task 1 and 2 were empty + >>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"} + True + """ # Output keys are keys that are given in the end diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index 0ab69ae..618970e 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -1,4 +1,4 @@ -from .proto import ProcessorPrototype, ChainedProcessor +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor from typing import Optional, Dict, List if "typing" == "nottyping": from ..tokenizers.memorizing import MemorizingTokenizer @@ -9,6 +9,30 @@ class MemoryzingProcessor(ChainedProcessor): by reinserting the original data alongside a new task (KEY) where we output the input seen by the Model + It reuses the memory from a class derived from MemorizingTokenizer so that it reintroduced + the original input into the token. 
+ + >>> from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer + >>> tokenizer = MemorizingTokenizer() + >>> # Fake token memory : (Index, Original Input, Input seen by Tagger) + >>> tokenizer.tokens = [(0, "A", "a"), (0, "b", "b"), (0, "q'", "q")] + >>> processor = MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=ProcessorPrototype()) + >>> processor.set_tasks(["lem"]) + >>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen + >>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY) + >>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"} + True + >>> # Some would have the same treated and input + >>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"} + True + >>> # Some differ with more characters + >>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"} + True + + This allows for easier output alignment as well as removing unknown characters to the model. If your lemmatizer + in training has never seen the "@" character, you can remove it at tokenization time and reinsert it with + MemoryzingProcessor + """ KEY: str = "treated" diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index d8a7e49..81dbcb3 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -8,6 +8,18 @@ class ProcessorPrototype: empty_value: str def __init__(self, empty_value: Optional[str] = None): + """ Applies postprocessing. Simplest Processor one could use. + + :param empty_value: Value to use to fill tasks that would not get any data + + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} + True + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} + True + """ self.tasks = [] self.empty_value = empty_value or DEFAULT_EMPTY @@ -22,20 +34,36 @@ def reinsert(self, form: str) -> Dict[str, str]: :param form: Token to reinsert :return: Dictionary representation of the token, as an annotation + + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} + True """ return dict(form=form, **{task: self.empty_value for task in self.tasks}) def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: """ Get the dictionary representation of a token annotation - :param token: - :param tags: - :return: + :param token: Token used as input for pie + :param tags: List of tags generated + :return: Dictionary representation of the token and its annotations + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} + True """ return {"form": token, **{k: val for k, val in zip(self.tasks, tags)}} def reset(self): - """ Functions that should be run in between documents """ + """ Functions that should be run in between documents + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reset() + """ pass @@ -43,6 +71,15 @@ class RenamedTaskProcessor(ProcessorPrototype): MAP: Dict[str, str] = {} def __init__(self, **kwargs): + """ This Processor is used for renaming tasks (Pie for example refuses tasks containing dots) + + >>> class ExampleRemaped(RenamedTaskProcessor): + ... 
MAP = {"task_name_1": "renamed"} + >>> x = ExampleRemaped() + >>> x.set_tasks(["task_name_1", "y"]) + >>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"} + True + """ super(RenamedTaskProcessor, self).__init__(**kwargs) self._map: Dict[str, str] = type(self).MAP @@ -53,7 +90,39 @@ def set_tasks(self, tasks): class ChainedProcessor(ProcessorPrototype): """ Allows for easy chaining ! - ChainedProcessor(ProcessorPrototype) basically should behave like a normal processor + The ChainedProcessor is basically using its headprocessor in the background and checking it's output to some extent + + The prototype of ChainedProcessor using Processor Prototype would have the same results because + chained processor is not doing anything new except enabling chaining + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> y = ChainedProcessor(x) + >>> y.set_tasks(["a", "b"]) + >>> x.reinsert("x") == y.reinsert("x") + True + >>> x.get_dict("y", ["1", "2"]) == y.get_dict("y", ["1", "2"]) + True + + You can subclass it to modify the output of the preceding processor : + + >>> class ExampleChained(ChainedProcessor): + ... def reinsert(self, form: str) -> Dict[str, str]: + ... annotation = self.head_processor.reinsert(form) + ... annotation["col3"] = "x" + ... return annotation + ... + ... def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]: + ... annotation = self.head_processor.get_dict(form, tags) + ... annotation["col3"] = "x" + ... return annotation + ... + >>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY")) + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"} + True + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"} + True """ head_processor: ProcessorPrototype @@ -76,4 +145,4 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: return self.head_processor.get_dict(token, tags) def reset(self): - self.head_processor.reset() \ No newline at end of file + self.head_processor.reset() diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py index d2d5f1b..0977342 100644 --- a/pie_extended/pipeline/postprocessor/rulebased.py +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -1,4 +1,4 @@ -from .proto import ProcessorPrototype, ChainedProcessor +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor from typing import Optional, Dict, List if "typing" == "nottyping": from ..tokenizers.memorizing import MemorizingTokenizer @@ -8,16 +8,27 @@ class RuleBasedProcessor(ChainedProcessor): """ Applies rules found in rules(token_annotation) """ - KEY: str = "treated" def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs): """ Apply rules on output of the taggers :param apply_on_reinsert: Apply rules on reinsert task + :param head_processor: Processor to use before post-processing its results + + >>> class ExampleRule(RuleBasedProcessor): + ... def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + ... if annotation["form"] == "need": + ... annotation["1"] = "REPLACED" + ... 
return annotation + >>> processor = ExampleRule() + >>> processor.set_tasks(["1", "2"]) + >>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"} + True + >>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"} + True """ super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs) - self._key: str = type(self).KEY - self.apply_on_reinsert= apply_on_reinsert + self.apply_on_reinsert = apply_on_reinsert def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: return annotation From efbfc7da82edbdbefdedfddd6a71d989ae212a17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 22 Feb 2020 09:54:56 +0100 Subject: [PATCH 13/14] Added documentation on how to run the python API --- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/README.md b/README.md index 56fb775..e5a3526 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,12 @@ The current system provide an easier access to adding **customized**: - disambiguation, - output formatting +## Install + +To install, simply do `pip install pie-extended`. Then, look at all available models. + +## Run on terminal + But on top of that, it provides a quick and easy way to use others models ! For example, in a shell : ```bash @@ -26,6 +32,53 @@ pie-extended tag laslsa your_file.txt will give you access to all you need ! +## Python API + +You can run the lemmatizer in your own scripts and retrieve token annotations as dictionaries: + +```python +from typing import List +from pie_extended.cli.sub import get_tagger, get_model, download + +# In case you need to download +do_download = False +if do_download: + for dl in download("lasla"): + x = 1 + +# model_path allows you to override the model loaded by another .tar +model_name = "lasla" +tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None) + +sentences: List[str] = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
"] +# Get the main object from the model (: data iterator + postprocesor +from pie_extended.models.lasla import get_iterator_and_processor +for sentence_group in sentences: + iterator, processor = get_iterator_and_processor() + print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor) ) +``` + +will result in + +```python +[{'form': 'lorem', 'lemma': 'lor', 'POS': 'NOMcom', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'lorem'}, + {'form': 'ipsum', 'lemma': 'ipse', 'POS': 'PROdem', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'ipsum'}, + {'form': 'dolor', 'lemma': 'dolor', 'POS': 'NOMcom', 'morph': 'Case=Nom|Numb=Sing', 'treated': 'dolor'}, + {'form': 'sit', 'lemma': 'sum1', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3', + 'treated': 'sit'}, + {'form': 'amet', 'lemma': 'amo', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3', + 'treated': 'amet'}, {'form': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','}, + {'form': 'consectetur', 'lemma': 'consector2', 'POS': 'VER', + 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Dep|Person=3', 'treated': 'consectetur'}, + {'form': 'adipiscing', 'lemma': 'adipiscor', 'POS': 'VER', 'morph': 'Tense=Pres|Voice=Dep', 'treated': 'adipiscing'}, + {'form': 'elit', 'lemma': 'elio', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3', + 'treated': 'elit'}, {'form': '.', 'lemma': '.', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '.'}] +``` + +## Add a model + +ToDo: Documentation + ## Warning This is an extremely early build, subject to change here and there. But it is functional ! \ No newline at end of file From 57e6188855b615b61aa646542ab71baa8bfa5b6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Tue, 25 Feb 2020 14:13:04 +0100 Subject: [PATCH 14/14] Multiple features and improved tests: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reworked the way we define tokens to be removed - Add the ability to add new removal in the CLI - Reworked GlueProcessor into a ChainedProcessor - New TestTagger that can be used to really simulate things. 
- Fixed some bugs regarding how empty sentences are dealt with - More documentation here and there Squashed commit of the following: commit c3962973a90e570ccfa46cf91df444ef73b0542b Author: Thibault Clérice Date: Tue Feb 25 14:03:13 2020 +0100 Clean up commit 680d2054d50eae7f660fd2707c3126b1dd480c53 Author: Thibault Clérice Date: Tue Feb 25 13:46:33 2020 +0100 Another attempt at fixing tests commit adff8724758eca0d6a0bbbde727400b8ade691b1 Author: Thibault Clérice Date: Tue Feb 25 13:29:49 2020 +0100 (Fixed tests) commit 2484287518a961c2032adef6b7d190773d256aff Author: Thibault Clérice Date: Tue Feb 25 13:00:56 2020 +0100 Added more tests to check for configuration with a new FakeTagget + Working exclude patterns commit f278d4a6a56cea7c5dd63e9e8bba32531c1d3ecb Author: Thibault Clérice Date: Tue Feb 25 09:44:48 2020 +0100 Try at making token exclusion easier to configure --- .gitignore | 1 + .travis.yml | 1 + pie_extended/cli/__init__.py | 12 +- pie_extended/cli/sub.py | 48 ++++++- pie_extended/models/fro/get.py | 9 +- pie_extended/models/fro/processor.py | 10 +- pie_extended/models/lasla/get.py | 9 +- pie_extended/models/lasla/processor.py | 5 +- pie_extended/pipeline/iterators/proto.py | 94 ++++++++---- .../pipeline/postprocessor/disambiguator.py | 22 --- pie_extended/pipeline/postprocessor/glue.py | 32 +++-- pie_extended/pipeline/postprocessor/memory.py | 10 +- pie_extended/pipeline/postprocessor/proto.py | 41 +++--- .../pipeline/postprocessor/rulebased.py | 1 + pie_extended/pipeline/tokenizers/classes.py | 5 - pie_extended/tagger.py | 17 ++- pie_extended/testing_utils/__init__.py | 71 +++++++++ tests/test_models/test_lasla.py | 136 +++++++++++++++++- 18 files changed, 420 insertions(+), 104 deletions(-) delete mode 100644 pie_extended/pipeline/postprocessor/disambiguator.py delete mode 100644 pie_extended/pipeline/tokenizers/classes.py diff --git a/.gitignore b/.gitignore index c986195..5c424f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .idea /*.txt pie_extended/downloads/* +tests/**/*.txt # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.travis.yml b/.travis.yml index 7f428a9..d1f2721 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ install: # command to run tests script: - pie-extended install-addons lasla + - pie-extended download lasla - nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest after_success: - coverage combine diff --git a/pie_extended/cli/__init__.py b/pie_extended/cli/__init__.py index b4d727c..e63fe34 100644 --- a/pie_extended/cli/__init__.py +++ b/pie_extended/cli/__init__.py @@ -1,6 +1,7 @@ import click from . 
import sub +from typing import Iterable MODELS = [name for name, *_ in sub.get_list()] @@ -54,7 +55,13 @@ def download(model): help="Raise error when a file is not tagged correctly") @click.option("--model_path", type=str, default=None, help="Provide this with your own model path if you want to test it") -def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path): +@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False, + help="Reset exclude patterns") +@click.option("--add-pattern", "add_pattern", + help="Add new exclude patterns for token (Regular expression)", multiple=True) +def tag(model: str, filepath: str, allowed_failure: bool, batch_size: int, device: str, debug: bool, + model_path: str, + reset_patterns: bool, add_pattern: Iterable[str]): """ Tag as many [filepath] as you want with [model] """ from tqdm import tqdm click.echo(click.style("Getting the tagger", bold=True)) @@ -69,7 +76,8 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path) failures = [] for file in tqdm(filepath): try: - sub.tag_file(model, tagger, file) + sub.tag_file(model, tagger, file, reset_exclude_patterns=reset_patterns, + exclude_patterns=add_pattern) except Exception as E: failures.append(E) click.echo("{} could not be lemmatized".format(file)) diff --git a/pie_extended/cli/sub.py b/pie_extended/cli/sub.py index 13c2e56..b8464d0 100644 --- a/pie_extended/cli/sub.py +++ b/pie_extended/cli/sub.py @@ -1,5 +1,5 @@ import os -from typing import Tuple, Iterable, Generator, Union +from typing import Tuple, Iterable, List, Union from importlib import import_module import requests @@ -11,11 +11,20 @@ from pie.utils import model_spec -def get_model(model): +def get_model(model: str): + """ Retrieve a module given a string + + :param model: Module Name + :return: Module + """ return import_module("{}.{}".format(models.__name__, model)) -def download(module) -> Iterable[Union[str, int]]: +def download(module: str) -> Iterable[Union[str, int]]: + """ Download dependencies for the given module + + :param module: Module for which to download models and static files in general + """ lemmatizer = get_model(module) os.makedirs(os.path.join(PATH, module), exist_ok=True) yield len(lemmatizer.DOWNLOADS) @@ -30,6 +39,8 @@ def download(module) -> Iterable[Union[str, int]]: def get_list() -> Iterable[Tuple[str, Metadata]]: + """ Retrieve a list of available modules + """ for module in models.modules: desc = getattr(get_model(module), "DESC", None) if desc: @@ -37,6 +48,14 @@ def get_list() -> Iterable[Tuple[str, Metadata]]: def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) -> ExtensibleTagger: + """ Retrieve the tagger + + :param model: Module of the tagger + :param batch_size: Size of the batch + :param device: Device to use (cuda/cpu) + :param model_path: Path to the model if you want to override the package one + :return: Tagger + """ module = get_model(model) disambiguator = getattr(module, "Disambiguator", None) if isinstance(disambiguator, ObjectCreator): @@ -48,9 +67,30 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) return tagger -def tag_file(model: str, tagger: ExtensibleTagger, fpath): +def tag_file( + model: str, tagger: ExtensibleTagger, + fpath: str, + reset_exclude_patterns: bool = False, + exclude_patterns: List[str] = None): + """ Tag a file with a given model + + :param model: Module name of the model + :param tagger: Tagger that should be used + :param fpath: 
Path to the file to tag
+    :param reset_exclude_patterns: Remove all pre-registered token exclusion regular expressions
+    :param exclude_patterns: New exclude patterns to add to the data iterator (does not require a reset)
+    """
     module = get_model(model)
     iterator, processor = getattr(module, "get_iterator_and_processor")()
+    # Drop the pre-registered exclusion patterns if a reset was requested
+    if reset_exclude_patterns:
+        iterator.reset_patterns()
+
+    # Register any additional exclusion patterns
+    if exclude_patterns:
+        for pattern in exclude_patterns:
+            iterator.add_pattern(pattern)
+
     tagger.tag_file(fpath, iterator=iterator, processor=processor)
     return True
 
diff --git a/pie_extended/models/fro/get.py b/pie_extended/models/fro/get.py
index 4bd2a43..2019473 100644
--- a/pie_extended/models/fro/get.py
+++ b/pie_extended/models/fro/get.py
@@ -1,6 +1,7 @@
 from .processor import FroRulesProcessor, FroGlueProcessor
+from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor
 from .tokenizer import FroMemorizingTokenizer
-from pie_extended.pipeline.iterators.proto import DataIterator
+from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
 from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
 
 
@@ -10,12 +11,14 @@ def get_iterator_and_processor():
         apply_on_reinsert=True,
         head_processor=MemoryzingProcessor(
             tokenizer_memory=tokenizer,
-            head_processor=FroGlueProcessor()
+            head_processor=FroGlueProcessor(
+                head_processor=RenamedTaskProcessor({"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."})
+            )
         )
     )
     iterator = DataIterator(
         tokenizer=tokenizer,
-        remove_from_input=DataIterator.remove_punctuation
+        exclude_patterns=[GenericExcludePatterns.Punctuation_and_Underscore]
     )
     return iterator, processor
diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py
index 21e64ba..64637e1 100644
--- a/pie_extended/models/fro/processor.py
+++ b/pie_extended/models/fro/processor.py
@@ -1,16 +1,17 @@
 import regex as re
-from typing import Dict
+from typing import Dict, Pattern
 
 from pie_extended.pipeline.postprocessor.glue import GlueProcessor
 from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor
+from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor
 
 
 class FroRulesProcessor(RuleBasedProcessor):
     """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically
 
     """
-    PONCTU = re.compile(r"^\W+$")
-    NUMBER = re.compile(r"\d+")
+    PONCTU: Pattern = re.compile(r"^\W+$")
+    NUMBER: Pattern = re.compile(r"\d+")
     PONFORT = [".", "...", "!", "?"]
 
     def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
@@ -35,9 +36,8 @@ class FroGlueProcessor(GlueProcessor):
     """
     OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
     GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]}
-    MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}
     EMPTY_TAG: Dict[str, str] = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_", "TEMPS": "_", "GENRE": "_",
                                  "PERS.": "_"}
 
     def __init__(self, *args, **kwargs):
-        super(FroGlueProcessor, self).__init__(*args, **kwargs)
+        super(FroGlueProcessor, self).__init__(*args, **kwargs)
\ No newline at end of file
diff --git a/pie_extended/models/lasla/get.py b/pie_extended/models/lasla/get.py
index 3c6e582..eedb03d 100644
--- a/pie_extended/models/lasla/get.py
+++ b/pie_extended/models/lasla/get.py
@@ -1,8 +1,9 @@
 import regex as re
 
 from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor
+from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor, ProcessorPrototype
 from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer
-from pie_extended.pipeline.iterators.proto import DataIterator
+from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
 from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
 
 # Uppercase regexp
@@ -15,11 +16,13 @@ def get_iterator_and_processor():
         apply_on_reinsert=True,
         head_processor=MemoryzingProcessor(
             tokenizer_memory=tokenizer,
-            head_processor=LatinGlueProcessor()
+            head_processor=LatinGlueProcessor(
+                ProcessorPrototype()
+            )
         )
     )
     iterator = DataIterator(
         tokenizer=tokenizer,
-        remove_from_input=DataIterator.remove_punctuation
+        exclude_patterns=[GenericExcludePatterns.Punctuation_and_Underscore]
     )
     return iterator, processor
diff --git a/pie_extended/models/lasla/processor.py b/pie_extended/models/lasla/processor.py
index 8af49bd..fd17347 100644
--- a/pie_extended/models/lasla/processor.py
+++ b/pie_extended/models/lasla/processor.py
@@ -30,10 +30,9 @@ def __init__(self, *args, **kwargs):
 
 
 class LatinGlueProcessor(GlueProcessor):
-    OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
+    OUTPUT_KEYS = ["form", "lemma", "pos", "morph"]
     GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]}
     WHEN_EMPTY = {"morph": "MORPH=empty"}
-    MAP = {"pos": "POS"}
 
     def __init__(self, *args, **kwargs):
-        super(LatinGlueProcessor, self).__init__(*args, **kwargs)
\ No newline at end of file
+        super(LatinGlueProcessor, self).__init__(*args, **kwargs)
diff --git a/pie_extended/pipeline/iterators/proto.py b/pie_extended/pipeline/iterators/proto.py
index 89d0bae..42a95b5 100644
--- a/pie_extended/pipeline/iterators/proto.py
+++ b/pie_extended/pipeline/iterators/proto.py
@@ -1,50 +1,97 @@
 import regex as re
-from pie.tagger import simple_tokenizer
-from typing import Callable, List, Tuple, Dict, Union, Iterable
+from typing import List, Tuple, Dict, Iterable, Pattern, Union
 
-from ...utils import ObjectCreator
-from ..tokenizers.simple_tokenizer import SimpleTokenizer
+from pie_extended.pipeline.tokenizers.simple_tokenizer import SimpleTokenizer
+from enum import Enum
 
-Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]]
-PUNKT = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1)
+
+
+class GenericExcludePatterns(Enum):
+    """ Useful set of regular expressions that can be used as exclude_patterns
+
+    """
+    Punctuation_and_Underscore: Pattern = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1)
+    Punctuation: Pattern = re.compile(r"^[^\s\w]+$")
+    PassageMarker: Pattern = re.compile(r"_Passage_[\w\d_]+")  # Use `_` as a joining character
 
 
 class DataIterator:
-    def __init__(self, tokenizer: SimpleTokenizer = None, remove_from_input: Callable = None):
+    def __init__(self, tokenizer: SimpleTokenizer = None, exclude_patterns: List[Union[str, Pattern]] = None):
         """ Iterator used to parse the text and returns bits to tag
 
         :param tokenizer: Tokenizer
         """
         self.tokenizer: SimpleTokenizer = tokenizer or SimpleTokenizer()
-        self.remove_from_input = remove_from_input
-        if self.remove_from_input is None:
-            self.remove_from_input = lambda x: (x, {})
+        self.exclude_patterns: List[Pattern] = []
+        if exclude_patterns:
+            for pattern in exclude_patterns:
+                self.add_pattern(pattern)
+
+    def add_pattern(self, pattern: str):
+        """ Add a token exclusion pattern
+
+        :param pattern: Regular expression (string or compiled pattern) matching tokens to exclude
+        """
+        if isinstance(pattern, str):
+            self.exclude_patterns.append(re.compile(pattern))
+        elif hasattr(pattern, "value"):  # Deal with enum members
+            
self.exclude_patterns.append(pattern.value) + else: + self.exclude_patterns.append(pattern) + + def reset_patterns(self) -> None: + """ Removes removal patterns + + >>> x = DataIterator(exclude_patterns=[r'\W+']) + >>> x.exclude_tokens(["Je", "suis", "content", ",", "mais", "...", '"', "fatigué", '"', "."]) + (['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'}) + >>> x.reset_patterns() + >>> x.exclude_tokens(["Je", "suis", "content", ",", "mais", "...", '"', "fatigué", '"', "."]) + (['Je', 'suis', 'content', ',', 'mais', '...', '"', 'fatigué', '"', '.'], {}) + """ + self.exclude_patterns = [] - @staticmethod - def remove_punctuation(sentence: List[str]) -> Tuple[List[str], Dict[int, str]]: + def exclude_tokens(self, sentence: List[str]) -> Tuple[List[str], Dict[int, str]]: """ Removes punctuation from a list and keeps its index :param sentence: :return: First the sentence with things removed, then a dictionary whose keys are index of token to reinsert and associated values are punctuation to reinsert. - >>> x = DataIterator.remove_punctuation(["Je", "suis", "content",",", "mais", "...", '"', "fatigué", '"', "."]) - >>> assert x == (['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'}) + You can use string when generating the exclude_pattern + + >>> x = DataIterator(exclude_patterns=[r'\W+']) + >>> x.exclude_tokens(["Je", "suis", "content",",", "mais", "...", '"', "fatigué", '"', "."]) + (['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'}) + + Pre-built removers: + + >>> x = DataIterator(exclude_patterns=[GenericExcludePatterns.PassageMarker]) + >>> x.exclude_tokens(["_Passage_45_78", "Ici", "commence", "le", "passage"]) + (['Ici', 'commence', 'le', 'passage'], {0: '_Passage_45_78'}) + + And of course you can ignore this option + + >>> x = DataIterator() + >>> x.exclude_tokens(["_Passage_45_78", "Ici", "commence", "le", "passage"]) + (['_Passage_45_78', 'Ici', 'commence', 'le', 'passage'], {}) + """ + if len(self.exclude_patterns) == 0: + return sentence, {} + clean, removed = [], {} for index, token in enumerate(sentence): - if PUNKT.match(token): - removed[index] = token - else: + match = False + for exclude_pattern in self.exclude_patterns: + if exclude_pattern.match(token): + removed[index] = token + match = True + break + if not match: clean.append(token) return clean, removed - def get_remover(self) -> Remover: - if isinstance(self.remove_from_input, ObjectCreator): - return self.remove_from_input.create() - return self.remove_from_input - def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], int, Dict[int, str]]]: """ Default iter data takes a text, an option to make lower and yield lists of words along with the length of the list @@ -53,7 +100,6 @@ def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], :param lower: Whether or not to lower the text :yields: (Sentence as a list of word, Size of the sentence, Elements removed from the sentence) """ - remover = self.get_remover() for sentence in self.tokenizer.sentence_tokenizer(data, lower=lower): - clean_sentence, removed_from_input = remover(sentence) + clean_sentence, removed_from_input = self.exclude_tokens(sentence) yield clean_sentence, len(clean_sentence), removed_from_input diff --git a/pie_extended/pipeline/postprocessor/disambiguator.py b/pie_extended/pipeline/postprocessor/disambiguator.py deleted file mode 100644 index 79d0895..0000000 --- 
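
A minimal usage sketch of the exclusion API introduced in this hunk; the token list and the numeric pattern are invented for illustration, and the expected values follow from the exclude_tokens doctests above:

from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns

# Strings are compiled by add_pattern(); Enum members are unwrapped to their compiled value
iterator = DataIterator(exclude_patterns=[r"^\W+$", GenericExcludePatterns.PassageMarker])

clean, removed = iterator.exclude_tokens(["_Passage_1_2", "Je", "suis", "content", "."])
# clean   == ['Je', 'suis', 'content']
# removed == {0: '_Passage_1_2', 4: '.'}   # index -> excluded token, reinserted after tagging

# Patterns can be swapped at runtime, which is what the new CLI options build on
iterator.reset_patterns()
iterator.add_pattern(r"^\d+$")  # from now on, only purely numeric tokens are excluded
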
a/pie_extended/pipeline/postprocessor/disambiguator.py +++ /dev/null @@ -1,22 +0,0 @@ -from ..disambiguators.proto import Disambiguator -from .proto import ProcessorPrototype, ChainedProcessor -from typing import Optional, Dict, List - - -# Right now disambiguation is applied at the sentence level. Question is should we ? -# Keeping that here for the moment - -class DisambiguatorProcessor(ChainedProcessor): - """ Applies rules found in rules(token_annotation) - - """ - - def __init__(self, disambiguator: Disambiguator, head_processor: Optional[ProcessorPrototype], **kwargs): - super(DisambiguatorProcessor, self).__init__(head_processor=head_processor, **kwargs) - self.disambiguator: Disambiguator = disambiguator - - def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: - return annotation - - def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: - return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 76cbf66..305951e 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -1,8 +1,8 @@ -from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, RenamedTaskProcessor +from pie_extended.pipeline.postprocessor.proto import ChainedProcessor, ProcessorPrototype, RenamedTaskProcessor from typing import Generator, Dict, List -class GlueProcessor(RenamedTaskProcessor): +class GlueProcessor(ChainedProcessor): """ Glues together specific tasks >>> class SimpleGlue(GlueProcessor): @@ -10,8 +10,9 @@ class GlueProcessor(RenamedTaskProcessor): ... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3` ... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag ... GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value - >>> x = SimpleGlue() - >>> x.set_tasks(["lemma", "1", "2"]) + >>> x = SimpleGlue(head_processor=ProcessorPrototype()) + >>> x.set_tasks(["lemma", "1", "2"]) # You can see things are remaped + ['lemma', 'task3'] >>> # Merges b and c values from task 1 and 2 into a new task >>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"} True @@ -22,6 +23,20 @@ class GlueProcessor(RenamedTaskProcessor): >>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"} True + You can also use remaped tasks: + + >>> class AnotherGlue(GlueProcessor): + ... OUTPUT_KEYS = ["form", "lemma", "POS", "task3"] + ... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3` + ... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag + ... 
GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value + >>> x = AnotherGlue(head_processor=RenamedTaskProcessor({"pos": "POS"})) + >>> x.set_tasks(["lemma", "pos", "1", "2"]) # You can see things are remaped + ['lemma', 'POS', 'task3'] + >>> # Merges b and c values from task 1 and 2 into a new task + >>> x.get_dict("a", ["a", "p", "b", "c"]) + {'form': 'a', 'lemma': 'a', 'POS': 'p', 'task3': '1=b|2=c'} + """ # Output keys are keys that are given in the end @@ -46,9 +61,6 @@ def __init__(self, *args, **kwargs): self._glue_empty = self.GLUE_EMPTY self._empty_tags = self.EMPTY_TAG - def set_tasks(self, tasks): - super(GlueProcessor, self).set_tasks(tasks) - def _yield_annotation( self, token_dict: Dict[str, str] @@ -62,7 +74,7 @@ def _yield_annotation( joined = self._glue_char.join([ glued_task + "=" + token_dict[glued_task] for glued_task in self._glue[head] - if token_dict[glued_task] != self._empty_tags.get(glued_task, -1) + if token_dict[glued_task] != self._empty_tags.get(glued_task, None) ]) if not joined: joined = self._glue_empty[head] @@ -74,3 +86,7 @@ def reinsert(self, form: str) -> Dict[str, str]: def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: as_dict = super(GlueProcessor, self).get_dict(token, tags) return dict(self._yield_annotation(as_dict)) + + @property + def tasks(self) -> List[str]: + return [key for key in self._out if key != "form"] diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index 618970e..6f0cd5a 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -18,6 +18,7 @@ class MemoryzingProcessor(ChainedProcessor): >>> tokenizer.tokens = [(0, "A", "a"), (0, "b", "b"), (0, "q'", "q")] >>> processor = MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=ProcessorPrototype()) >>> processor.set_tasks(["lem"]) + ['lem', 'treated'] >>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen >>> # By the tagger is returned in a new column, treated (cf. 
MemorizingProcessor.KEY) >>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"} @@ -36,10 +37,11 @@ class MemoryzingProcessor(ChainedProcessor): """ KEY: str = "treated" - def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Optional[ProcessorPrototype], **kwargs): + def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: ProcessorPrototype, + key: Optional[str] = None, **kwargs): super(MemoryzingProcessor, self).__init__(head_processor=head_processor, **kwargs) self.memory: "MemorizingTokenizer" = tokenizer_memory - self._key: str = type(self).KEY + self._key: str = key or type(self).KEY def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: # First we get the dictionary @@ -52,6 +54,10 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: token_dict["form"] = input_token return token_dict + @property + def tasks(self) -> List[str]: + return self.head_processor.tasks + ["treated"] + def reinsert(self, form: str) -> Dict[str, str]: self.memory.tokens.pop(0) return super(MemoryzingProcessor, self).reinsert(form) diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index 81dbcb3..150c9f4 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -4,7 +4,6 @@ class ProcessorPrototype: - tasks: List[str] empty_value: str def __init__(self, empty_value: Optional[str] = None): @@ -15,16 +14,22 @@ def __init__(self, empty_value: Optional[str] = None): >>> x = ProcessorPrototype(empty_value="%") >>> x.set_tasks(["a", "b"]) + ['a', 'b'] >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} True >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} True """ - self.tasks = [] + self._tasks = [] self.empty_value = empty_value or DEFAULT_EMPTY - def set_tasks(self, tasks): - self.tasks = tasks + @property + def tasks(self) -> List[str]: + return self._tasks + + def set_tasks(self, tasks) -> List[str]: + self._tasks = tasks + return tasks def postprocess(self, line): pass @@ -38,10 +43,11 @@ def reinsert(self, form: str) -> Dict[str, str]: >>> x = ProcessorPrototype(empty_value="%") >>> x.set_tasks(["a", "b"]) + ['a', 'b'] >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} True """ - return dict(form=form, **{task: self.empty_value for task in self.tasks}) + return dict(form=form, **{task: self.empty_value for task in self._tasks}) def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: """ Get the dictionary representation of a token annotation @@ -52,39 +58,39 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: >>> x = ProcessorPrototype(empty_value="%") >>> x.set_tasks(["a", "b"]) + ['a', 'b'] >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} True """ - return {"form": token, **{k: val for k, val in zip(self.tasks, tags)}} + return {"form": token, **{k: val for k, val in zip(self._tasks, tags)}} def reset(self): """ Functions that should be run in between documents >>> x = ProcessorPrototype(empty_value="%") >>> x.set_tasks(["a", "b"]) + ['a', 'b'] >>> x.reset() """ pass class RenamedTaskProcessor(ProcessorPrototype): - MAP: Dict[str, str] = {} - - def __init__(self, **kwargs): + def __init__(self, task_map: Dict[str, str], **kwargs): """ This Processor is used for renaming tasks (Pie for example refuses tasks containing dots) - >>> class ExampleRemaped(RenamedTaskProcessor): - ... 
MAP = {"task_name_1": "renamed"} - >>> x = ExampleRemaped() + >>> x = RenamedTaskProcessor({"task_name_1": "renamed"}) >>> x.set_tasks(["task_name_1", "y"]) + ['renamed', 'y'] >>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"} True """ super(RenamedTaskProcessor, self).__init__(**kwargs) - self._map: Dict[str, str] = type(self).MAP + self._map: Dict[str, str] = task_map def set_tasks(self, tasks): - self.tasks = [self._map.get(task, task) for task in tasks] + self._tasks = [self._map.get(task, task) for task in tasks] + return self.tasks class ChainedProcessor(ProcessorPrototype): @@ -97,8 +103,10 @@ class ChainedProcessor(ProcessorPrototype): >>> x = ProcessorPrototype(empty_value="%") >>> x.set_tasks(["a", "b"]) + ['a', 'b'] >>> y = ChainedProcessor(x) >>> y.set_tasks(["a", "b"]) + ['a', 'b'] >>> x.reinsert("x") == y.reinsert("x") True >>> x.get_dict("y", ["1", "2"]) == y.get_dict("y", ["1", "2"]) @@ -119,6 +127,7 @@ class ChainedProcessor(ProcessorPrototype): ... >>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY")) >>> x.set_tasks(["a", "b"]) + ['a', 'b'] >>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"} True >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"} @@ -135,8 +144,8 @@ def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): self.head_processor = ProcessorPrototype() def set_tasks(self, tasks): - super(ChainedProcessor, self).set_tasks(tasks) - self.head_processor.set_tasks(tasks) + self._tasks = self.head_processor.set_tasks(tasks) + return self.tasks def reinsert(self, form: str) -> Dict[str, str]: return self.head_processor.reinsert(form) diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py index 0977342..7abed02 100644 --- a/pie_extended/pipeline/postprocessor/rulebased.py +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -22,6 +22,7 @@ def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[Pro ... 
return annotation >>> processor = ExampleRule() >>> processor.set_tasks(["1", "2"]) + ['1', '2'] >>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"} True >>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"} diff --git a/pie_extended/pipeline/tokenizers/classes.py b/pie_extended/pipeline/tokenizers/classes.py deleted file mode 100644 index 71e21a0..0000000 --- a/pie_extended/pipeline/tokenizers/classes.py +++ /dev/null @@ -1,5 +0,0 @@ -from typing import Callable, Iterable, List - -Tokenizer = Callable[[str, bool], Iterable[List[str]]] - - diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 604e401..9e22ce6 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -26,10 +26,13 @@ def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorProto _, ext = os.path.splitext(fpath) - with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f: + out_file = utils.ensure_ext(fpath, ext, 'pie') + with open(out_file, 'w+') as f: for line in self.iter_tag(data, iterator, processor=processor): f.write(line) + return out_file + def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) -> str: return list(self.iter_tag_token(data, iterator, processor=processor)) @@ -42,15 +45,19 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor for chunk in utils.chunks( iterator(data, lower=self.lower), size=self.batch_size): + # Unzip the batch into the sentences, their sizes and the dictionaries of things that needs # to be reinserted + sents, lengths, needs_reinsertion = zip(*chunk) is_empty = [not bool(sent) for sent in sents] + tagged, tasks = self.tag( sents=[sent for sent in sents if sent], - lengths=lengths + lengths=[l for l in lengths if l != 0] ) + if not processor.tasks: processor.set_tasks(tasks) @@ -65,7 +72,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor sent_reinsertion = needs_reinsertion[sents_index] # If we have a disambiguator, we run the results into it - if self.disambiguation: + if self.disambiguation and sent: sent = self.disambiguation(sent, tasks) reinsertion_index = 0 @@ -82,12 +89,12 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor for reinsertion in sorted(list(sent_reinsertion.keys())): yield processor.reinsert(sent_reinsertion[reinsertion]) - def iter_tag(self, data: str, iterator: DataIterator, processor: type): + def iter_tag(self, data: str, iterator: DataIterator, processor: ProcessorPrototype): formatter = None for annotation in self.iter_tag_token(data, iterator, processor): if not formatter: - formatter = Formatter(list(annotation.keys())) + formatter = Formatter(processor.tasks) yield formatter.write_headers() yield formatter.write_line(formatter.format_line(annotation)) diff --git a/pie_extended/testing_utils/__init__.py b/pie_extended/testing_utils/__init__.py index 8c8ef57..3754427 100644 --- a/pie_extended/testing_utils/__init__.py +++ b/pie_extended/testing_utils/__init__.py @@ -1,4 +1,8 @@ +from typing import List, Tuple +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype from pie_extended.tagger import ExtensibleTagger +from pie.utils import model_spec class FakeTagger(ExtensibleTagger): @@ -14,3 +18,70 @@ def tag(self, sents, **kwargs): self.seen.extend(sents) return self.tokens, self.tasks + + +class FakeAutoTag(ExtensibleTagger): + def 
__init__(self, tasks: List[str], **kwargs): + self.tokens: List[str] = [] + self.lengths: List[int] = [] + self.tasks = tasks + for key in kwargs: + setattr(self, key, kwargs[key]) + + def tag(self, sents: List[List[str]], lengths: List[int], *args, **kwargs): + """ Fake tagging tokens by enumerating informations + + >>> tagger = FakeAutoTag(["pos", "lemma"]) + >>> tagger.tag([['a', 'b'], ['c']], lengths=[2, 1]) + ([[('a', ('pos0', 'lemma0')), ('b', ('pos1', 'lemma1'))], [('c', ('pos2', 'lemma2'))]], ['pos', 'lemma']) + + """ + self.tokens.extend(list(sents)) + self.lengths.extend(lengths) + + for t, l in zip(sents, lengths): + if len(t) != l: + raise ValueError("Tokens and lengths are inequal [len({}) != {}]".format(str(t), l)) + + out = [] + total = 0 + + def get_task(task, i): + return task+str(i) + + for sent in sents: + out.append([]) + for tok in sent: + out[-1].append((tok, tuple(list(get_task(task, total) for task in self.tasks)))) + total += 1 + return out, self.tasks + + @staticmethod + def from_model_string(model_string: str, **kwargs) -> "FakeAutoTag": + """ + + :param model_string: + :return: + + >>> tagger = FakeAutoTag.from_model_string("") + >>> tagger.tasks + ['MODE', 'TEMPS', 'PERS', 'NOMB', 'lemma', 'pos'] + """ + return FakeAutoTag(tasks=[ + task + for _, tasks in model_spec(model_string) + for task in tasks + ], **kwargs) + + +def create_auto_tagger(module, **kwargs) -> Tuple[FakeAutoTag, DataIterator, ProcessorPrototype]: + """ Create a tagger as well as the iterator """ + tagger = FakeAutoTag.from_model_string(module.Models, batch_size=16, **kwargs) + + disambiguator = getattr(module, "Disambiguator", None) + if hasattr(disambiguator, "create"): + disambiguator = disambiguator.create() + tagger.disambiguation = disambiguator + + iterator, processor = module.get_iterator_and_processor() + return tagger, iterator, processor diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 1359afc..cadef81 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -1,10 +1,24 @@ from pie_extended.models.lasla.get import get_iterator_and_processor -from pie_extended.testing_utils import FakeTagger +from pie_extended.models import lasla +from pie_extended.testing_utils import FakeTagger, create_auto_tagger from typing import List, Tuple +import csv from unittest import TestCase +def write_crazy_file() -> str: + filename = "crazy_text_file.txt" + with open(filename, "w") as f: + f.write("""\\\\\\<1>[$@$](V)\\\\\\§ +\\\\<1>[$@$]\\\\§ +§ +\\[I]\\§ +En honor et en bien et en gran remembrançe § +Et offerant mercé, honor et celebrançe §""") + return filename + + def make_fake_data(sentences: List[str], nb_tasks: int = 9) -> List[Tuple[str, List[str]]]: return [ [ @@ -45,7 +59,7 @@ def test_consecutive_dots(self): ) self.assertEqual( result[12], - {"form": "uiduarum", "lemma": "uiduarum", "POS": "fake", "morph": "Case=fake|Numb=fake|Deg=fake|Mood=fake|" + {"form": "uiduarum", "lemma": "uiduarum", "pos": "fake", "morph": "Case=fake|Numb=fake|Deg=fake|Mood=fake|" "Tense=fake|Voice=fake|Person=fake", "treated": "uiduarum"}, "Punctuation should be reinserted and mostly should not break anything" @@ -125,3 +139,121 @@ def test_underscores(self): flatten_seen, "Seen element should not count the underscord" ) + + def test_with_fake_advanced_tagger(self): + target = write_crazy_file() + tagger, it, pr = create_auto_tagger(lasla, lower=True) + out_file = tagger.tag_file(target, it, pr) + content = [] + with open(out_file) as f: + header = [] 
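
Before the long expected output below, a short sketch of what the fake tagger produces, mirroring its doctest above: FakeAutoTag numbers every token it sees, which is where the `lemma0`, `pos1`, … values in the expectation come from. The lasla import matches the test module's own import.

from pie_extended.models import lasla
from pie_extended.testing_utils import FakeAutoTag, create_auto_tagger

# Each token N gets "<task>N" for every task, in input order
tagger = FakeAutoTag(["pos", "lemma"])
print(tagger.tag([["a", "b"], ["c"]], lengths=[2, 1]))
# ([[('a', ('pos0', 'lemma0')), ('b', ('pos1', 'lemma1'))], [('c', ('pos2', 'lemma2'))]], ['pos', 'lemma'])

# create_auto_tagger wires the fake tagger to a model's real iterator and processor,
# so the whole post-processing chain can be exercised without loading a neural model
tagger, iterator, processor = create_auto_tagger(lasla, lower=True)
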
+ for line in f: + splitted = line.strip().split() + if not header: + header = splitted + continue + content.append(dict(list(zip(header, splitted)))) + + self.assertEqual( + content, + [{'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '<', 'lemma': '<', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '<'}, + {'token': '1', 'lemma': 'lemma0', 'pos': 'pos0', + 'morph': 'Case=Case0|Numb=Numb0|Deg=Deg0|Mood=Mood0|Tense=Tense0|Voice=Voice0|Person=Person0', + 'treated': '1'}, + {'token': '>', 'lemma': '>', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '>'}, + {'token': '[', 'lemma': '[', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '['}, + {'token': '$', 'lemma': '$', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '$'}, + {'token': '@', 'lemma': '@', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '@'}, + {'token': '$', 'lemma': '$', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '$'}, + {'token': ']', 'lemma': ']', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ']'}, + {'token': '(', 'lemma': '(', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '('}, + {'token': 'v', 'lemma': 'lemma1', 'pos': 'pos1', + 'morph': 'Case=Case1|Numb=Numb1|Deg=Deg1|Mood=Mood1|Tense=Tense1|Voice=Voice1|Person=Person1', + 'treated': 'u'}, + {'token': ')', 'lemma': ')', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ')'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '§', 'lemma': '§', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '§'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '<', 'lemma': '<', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '<'}, + {'token': '1', 'lemma': 'lemma2', 'pos': 'pos2', + 'morph': 'Case=Case2|Numb=Numb2|Deg=Deg2|Mood=Mood2|Tense=Tense2|Voice=Voice2|Person=Person2', + 'treated': '1'}, + {'token': '>', 'lemma': '>', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '>'}, + {'token': '[', 'lemma': '[', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '['}, + {'token': '$', 'lemma': '$', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '$'}, + {'token': '@', 'lemma': '@', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '@'}, + {'token': '$', 'lemma': '$', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '$'}, + {'token': ']', 'lemma': ']', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ']'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '§', 'lemma': '§', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '§'}, + {'token': '§', 'lemma': '§', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '§'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '[', 'lemma': '[', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '['}, + {'token': 'i', 'lemma': 'lemma3', 'pos': 'pos3', + 'morph': 
'Case=Case3|Numb=Numb3|Deg=Deg3|Mood=Mood3|Tense=Tense3|Voice=Voice3|Person=Person3', + 'treated': 'i'}, + {'token': ']', 'lemma': ']', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ']'}, + {'token': '\\', 'lemma': '\\', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '\\'}, + {'token': '§', 'lemma': '§', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '§'}, + {'token': 'en', 'lemma': 'lemma4', 'pos': 'pos4', + 'morph': 'Case=Case4|Numb=Numb4|Deg=Deg4|Mood=Mood4|Tense=Tense4|Voice=Voice4|Person=Person4', + 'treated': 'en'}, + {'token': 'honor', 'lemma': 'lemma5', 'pos': 'pos5', + 'morph': 'Case=Case5|Numb=Numb5|Deg=Deg5|Mood=Mood5|Tense=Tense5|Voice=Voice5|Person=Person5', + 'treated': 'honor'}, + {'token': 'et', 'lemma': 'lemma6', 'pos': 'pos6', + 'morph': 'Case=Case6|Numb=Numb6|Deg=Deg6|Mood=Mood6|Tense=Tense6|Voice=Voice6|Person=Person6', + 'treated': 'et'}, + {'token': 'en', 'lemma': 'lemma7', 'pos': 'pos7', + 'morph': 'Case=Case7|Numb=Numb7|Deg=Deg7|Mood=Mood7|Tense=Tense7|Voice=Voice7|Person=Person7', + 'treated': 'en'}, + {'token': 'bie', 'lemma': 'lemma8', 'pos': 'pos8', + 'morph': 'Case=Case8|Numb=Numb8|Deg=Deg8|Mood=Mood8|Tense=Tense8|Voice=Voice8|Person=Person8', + 'treated': 'bie'}, + {'token': '-ne', 'lemma': 'ne2', 'pos': 'pos9', + 'morph': 'Case=Case9|Numb=Numb9|Deg=Deg9|Mood=Mood9|Tense=Tense9|Voice=Voice9|Person=Person9', + 'treated': '-ne'}, + {'token': 'et', 'lemma': 'lemma10', 'pos': 'pos10', + 'morph': 'Case=Case10|Numb=Numb10|Deg=Deg10|Mood=Mood10|Tense=Tense10|Voice=Voice10|Person=Person10', + 'treated': 'et'}, + {'token': 'en', 'lemma': 'lemma11', 'pos': 'pos11', + 'morph': 'Case=Case11|Numb=Numb11|Deg=Deg11|Mood=Mood11|Tense=Tense11|Voice=Voice11|Person=Person11', + 'treated': 'en'}, + {'token': 'gra', 'lemma': 'lemma12', 'pos': 'pos12', + 'morph': 'Case=Case12|Numb=Numb12|Deg=Deg12|Mood=Mood12|Tense=Tense12|Voice=Voice12|Person=Person12', + 'treated': 'gra'}, + {'token': '-ne', 'lemma': 'ne2', 'pos': 'pos13', + 'morph': 'Case=Case13|Numb=Numb13|Deg=Deg13|Mood=Mood13|Tense=Tense13|Voice=Voice13|Person=Person13', + 'treated': '-ne'}, + {'token': 'remembrançe', 'lemma': 'lemma14', 'pos': 'pos14', + 'morph': 'Case=Case14|Numb=Numb14|Deg=Deg14|Mood=Mood14|Tense=Tense14|Voice=Voice14|Person=Person14', + 'treated': 'remembrançe'}, + {'token': '§', 'lemma': '§', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '§'}, + {'token': 'et', 'lemma': 'lemma15', 'pos': 'pos15', + 'morph': 'Case=Case15|Numb=Numb15|Deg=Deg15|Mood=Mood15|Tense=Tense15|Voice=Voice15|Person=Person15', + 'treated': 'et'}, + {'token': 'offerant', 'lemma': 'lemma16', 'pos': 'pos16', + 'morph': 'Case=Case16|Numb=Numb16|Deg=Deg16|Mood=Mood16|Tense=Tense16|Voice=Voice16|Person=Person16', + 'treated': 'offerant'}, + {'token': 'mercé', 'lemma': 'lemma17', 'pos': 'pos17', + 'morph': 'Case=Case17|Numb=Numb17|Deg=Deg17|Mood=Mood17|Tense=Tense17|Voice=Voice17|Person=Person17', + 'treated': 'mercé'}, + {'token': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','}, + {'token': 'honor', 'lemma': 'lemma18', 'pos': 'pos18', + 'morph': 'Case=Case18|Numb=Numb18|Deg=Deg18|Mood=Mood18|Tense=Tense18|Voice=Voice18|Person=Person18', + 'treated': 'honor'}, + {'token': 'et', 'lemma': 'lemma19', 'pos': 'pos19', + 'morph': 'Case=Case19|Numb=Numb19|Deg=Deg19|Mood=Mood19|Tense=Tense19|Voice=Voice19|Person=Person19', + 'treated': 'et'}, + {'token': 'celebrançe', 'lemma': 'lemma20', 'pos': 'pos20', + 'morph': 'Case=Case20|Numb=Numb20|Deg=Deg20|Mood=Mood20|Tense=Tense20|Voice=Voice20|Person=Person20', + 
'treated': 'celebrançe'}, + {'token': '§', 'lemma': '§', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '§'}] + )
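
Finally, an end-to-end sketch of the Python API this series exposes. It assumes the lasla model files have already been downloaded and uses a hypothetical input path; the same exclusion controls are available on the command line through the new --reset-exclude-patterns and --add-pattern options.

from pie_extended.cli import sub

# Build a tagger for one of the bundled modules (CPU, default batch size)
tagger = sub.get_tagger("lasla", batch_size=16, device="cpu")

# Keep the module's default exclusions and additionally ignore editorial passage markers
sub.tag_file(
    "lasla", tagger, "mytext.txt",
    reset_exclude_patterns=False,
    exclude_patterns=[r"_Passage_[\w\d_]+"],
)
# The tagged output is written next to the input file
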