From 5f64babd02271f8b2264bec5208c142c12c196ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 16:44:16 +0100 Subject: [PATCH 01/13] (Software API) Huge start of a rework to make PostProcessing easier and available outside. Partial answer to #6 --- pie_extended/cli/sub.py | 4 +- pie_extended/models/fro/__init__.py | 4 +- pie_extended/models/fro/classes.py | 79 +++++++------------ pie_extended/models/lasla/__init__.py | 2 +- pie_extended/models/lasla/classes.py | 60 ++++++++------ pie_extended/pipeline/formatters/glue.py | 60 -------------- pie_extended/pipeline/formatters/proto.py | 25 +++++- .../pipeline/postprocessor/__init__.py | 0 .../pipeline/postprocessor/disambiguator.py | 22 ++++++ pie_extended/pipeline/postprocessor/glue.py | 52 ++++++++++++ pie_extended/pipeline/postprocessor/memory.py | 29 +++++++ pie_extended/pipeline/postprocessor/proto.py | 79 +++++++++++++++++++ .../pipeline/postprocessor/rulebased.py | 21 +++++ pie_extended/tagger.py | 56 ++++++------- tests/test_models/test_lasla.py | 10 +-- 15 files changed, 318 insertions(+), 185 deletions(-) delete mode 100644 pie_extended/pipeline/formatters/glue.py create mode 100644 pie_extended/pipeline/postprocessor/__init__.py create mode 100644 pie_extended/pipeline/postprocessor/disambiguator.py create mode 100644 pie_extended/pipeline/postprocessor/glue.py create mode 100644 pie_extended/pipeline/postprocessor/memory.py create mode 100644 pie_extended/pipeline/postprocessor/proto.py create mode 100644 pie_extended/pipeline/postprocessor/rulebased.py diff --git a/pie_extended/cli/sub.py b/pie_extended/cli/sub.py index 3f1f917..13c2e56 100644 --- a/pie_extended/cli/sub.py +++ b/pie_extended/cli/sub.py @@ -50,8 +50,8 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) def tag_file(model: str, tagger: ExtensibleTagger, fpath): module = get_model(model) - iterator, formatter = getattr(module, "get_iterator_and_formatter")() - tagger.tag_file(fpath, iterator=iterator, formatter_class=formatter) + iterator, processor = getattr(module, "get_iterator_and_processor")() + tagger.tag_file(fpath, iterator=iterator, processor=processor) return True diff --git a/pie_extended/models/fro/__init__.py b/pie_extended/models/fro/__init__.py index b2d3ad9..63da07b 100644 --- a/pie_extended/models/fro/__init__.py +++ b/pie_extended/models/fro/__init__.py @@ -1,5 +1,5 @@ -from ...utils import Metadata, File ,get_path -from .classes import get_iterator_and_formatter +from ...utils import Metadata, File, get_path +from .classes import get_iterator_and_processor from ...pipeline.iterators.proto import DataIterator DESC = Metadata( diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/classes.py index 99fff84..01abff4 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/classes.py @@ -1,8 +1,11 @@ import regex as re -from typing import List -from ...pipeline.formatters.glue import GlueFormatter as SourceGlueFormatter +from typing import List, Dict from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor +from pie_extended.pipeline.postprocessor.glue import GlueProcessor # Uppercase regexp 
_uppercase = re.compile("^[A-ZÉÈÀÂÊÎÔÛŶÄËÏÖÜŸ]$") @@ -86,67 +89,41 @@ def _normalizer(self, data: str): return data -class GlueFormatter(SourceGlueFormatter): - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"] - +class FroRulesProcessor(RuleBasedProcessor): PONCTU = re.compile(r"^\W+$") NUMBER = re.compile(r"\d+") PONFORT = [".", "...", "!", "?"] - def __init__(self, tokenizer_memory: MemorizingTokenizer): - super(GlueFormatter, self).__init__(tokenizer_memory=tokenizer_memory) - - def rule_based(cls, token): - if cls.PONCTU.match(token): - lemma = token - if token in GlueFormatter.PONFORT: + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + token = annotation["form"] + if self.PONCTU.match(token): + if token in self.PONFORT: pos = "PONfrt" else: pos = "PONfbl" - return [token, lemma, pos, "MORPH=empty", token] - - def format_line(self, token, tags, ignored=False): - tags = list(tags) - lemma = tags[self.tasks.index("lemma")] - index, input_token, out_token = self.tokenizer_memory.tokens.pop(0) - - if token != out_token: - raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) - - overwriten = self.rule_based(out_token) - - if overwriten: - return overwriten - - if type(self).NUMBER.match(token): # This would push for sending the whole elements to rule_based and - # not the token only - lemma = token - tags[self.tasks.index(self.pos_tag)] = "ADJcar" - - return [ - input_token, - lemma, - tags[self.tasks.index(self.pos_tag)], - "|".join( - "{cat}={tag}".format( - cat=morph_part, - tag=tags[self.tasks.index(morph_part.replace(".", ""))] - ) - for morph_part in GlueFormatter.MORPH_PART - if morph_part.replace(".", "") in self.tasks and - tags[self.tasks.index(morph_part.replace(".", ""))] != "_" - ) or "MORPH=empty", - out_token - ] + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} + elif self.NUMBER.match(token): + annotation["pos"] = "ADJcar" + return annotation + +class FroGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} + MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} -def get_iterator_and_formatter(): + +def get_iterator_and_processor(): tokenizer = MemorizingTokenizer() - formatter = GlueFormatter(tokenizer) + processor = FroRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=FroGlueProcessor() + ) + ) iterator = DataIterator( tokenizer=tokenizer, remove_from_input=DataIterator.remove_punctuation ) - return iterator, formatter + return iterator, processor diff --git a/pie_extended/models/lasla/__init__.py b/pie_extended/models/lasla/__init__.py index 05f68de..7074413 100644 --- a/pie_extended/models/lasla/__init__.py +++ b/pie_extended/models/lasla/__init__.py @@ -1,2 +1,2 @@ from pie_extended.models.lasla.consts import DOWNLOADS, Models, Disambiguator, addons, DESC -from pie_extended.models.lasla.classes import get_iterator_and_formatter +from pie_extended.models.lasla.classes import get_iterator_and_processor diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py index 7276346..3599be8 100644 --- a/pie_extended/models/lasla/classes.py +++ b/pie_extended/models/lasla/classes.py @@ -13,10 +13,35 @@ from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.formatters.glue import GlueFormatter as 
SourceGlueFormatter +from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor +from pie_extended.pipeline.postprocessor.glue import GlueProcessor from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer +from typing import Dict +class LatinRulesProcessor(RuleBasedProcessor): + PONCTU = re.compile(r"^\W+$") + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + # If Else condition + token = annotation["form"] + if self.PONCTU.match(token): + return {"form": token, "lemma": token, "POS": "PUNC", "morph": "MORPH=empty"} + elif token.startswith("-"): + if token == "-ne": + annotation["lemma"] = "ne2" + else: + annotation["lemma"] = "ne" + return annotation + + +class LatinGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + WHEN_EMPTY = {"morph": "MORPH=empty"} + # Uppercase regexp uppercase = re.compile(r"^[A-Z]$") @@ -74,33 +99,16 @@ def normalizer(self, data: str): return data -class GlueFormatter(SourceGlueFormatter): - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"] - PONCTU = re.compile(r"^\W+$") - - def __init__(self, tokenizer_memory): - super(GlueFormatter, self).__init__([]) - self.tokenizer_memory = tokenizer_memory - - def rule_based(cls, token): - if cls.PONCTU.match(token): - return [token, token, "PUNC", "MORPH=empty", token] - elif token.startswith("-"): - if token == "-ne": - lemma = "ne2" - else: - lemma = token[1:] - return [token, lemma, "CONcoo", "MORPH=empty", token] - - return None - - -def get_iterator_and_formatter(): +def get_iterator_and_processor(): tokenizer = MemorizingTokenizer() - formatter = GlueFormatter(tokenizer) + processor = LatinRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=LatinGlueProcessor() + ) + ) iterator = DataIterator( tokenizer=tokenizer, remove_from_input=DataIterator.remove_punctuation ) - return iterator, formatter + return iterator, processor diff --git a/pie_extended/pipeline/formatters/glue.py b/pie_extended/pipeline/formatters/glue.py deleted file mode 100644 index b1024dd..0000000 --- a/pie_extended/pipeline/formatters/glue.py +++ /dev/null @@ -1,60 +0,0 @@ -import regex as re -from .proto import Formatter - - -class GlueFormatter(Formatter): - """ Need replacing of morph_part for specific corpora - - """ - - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"] - PONCTU = re.compile(r"^\W+$") - - def __init__(self, tokenizer_memory): - super(GlueFormatter, self).__init__([]) - self.tokenizer_memory = tokenizer_memory - - def __call__(self, tasks): - super(GlueFormatter, self).__init__(tasks) - self.pos_tag = "POS" - if "POS" not in self.tasks and "pos" in self.tasks: - self.pos_tag = "pos" - return self - - @classmethod - def get_headers(cls): - return cls.HEADERS - - def rule_based(cls, token): - if cls.PONCTU.match(token): - return [token, token, "PUNC", "MORPH=empty", token] - - return None - - def format_line(self, token, tags, ignored=False): - tags = list(tags) - lemma = tags[self.tasks.index("lemma")] - index, input_token, out_token = self.tokenizer_memory.tokens.pop(0) - if 
token != out_token: - raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) - - overwriten = self.rule_based(token) - if overwriten: - return overwriten - - return [ - input_token, - lemma, - tags[self.tasks.index(self.pos_tag)], - "|".join( - "{cat}={tag}".format( - cat=morph_part, - tag=tags[self.tasks.index(morph_part)] - ) - for morph_part in type(self).MORPH_PART - if morph_part in self.tasks and - tags[self.tasks.index(morph_part)] != "_" - ) or "MORPH=empty", - out_token - ] diff --git a/pie_extended/pipeline/formatters/proto.py b/pie_extended/pipeline/formatters/proto.py index f4b2c2d..ac69acc 100644 --- a/pie_extended/pipeline/formatters/proto.py +++ b/pie_extended/pipeline/formatters/proto.py @@ -1,13 +1,30 @@ -from typing import List, Iterable +from typing import List, Iterable, Callable, Dict +import sys class Formatter: # Default is TSV + """ The CSV formatter necessarily starts with form in its header. + + """ + format_line: Callable[[Dict[str, str]], List[str]] + def __init__(self, tasks: List[str]): self.tasks: List[str] = tasks - def format_line(self, token: str, tags: Iterable[str], ignored=False) -> List[str]: - """ Format the tags""" - return [token] + list(tags) + if sys.version_info.minor <= 6: + # Before 3.7, order of dictionary is not guaranteed + # Cf. https://mail.python.org/pipermail/python-dev/2017-December/151283.html + self.format_line = self.format_line_3_6 + else: + self.format_line = self.format_line_3_7 + + def format_line_3_6(self, annotation: Dict[str, str]) -> List[str]: + """ Format the tags """ + return [annotation["form"]] + [annotation[task] for task in self.tasks] + + def format_line_3_7(self, annotation: Dict[str, str]) -> List[str]: + """ Format the tags """ + return list(annotation.values()) def write_line(self, formatted): return "\t".join(formatted) + "\r\n" diff --git a/pie_extended/pipeline/postprocessor/__init__.py b/pie_extended/pipeline/postprocessor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pie_extended/pipeline/postprocessor/disambiguator.py b/pie_extended/pipeline/postprocessor/disambiguator.py new file mode 100644 index 0000000..79d0895 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/disambiguator.py @@ -0,0 +1,22 @@ +from ..disambiguators.proto import Disambiguator +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List + + +# Right now disambiguation is applied at the sentence level. Question is should we ? 
+# Keeping that here for the moment + +class DisambiguatorProcessor(ChainedProcessor): + """ Applies rules found in rules(token_annotation) + + """ + + def __init__(self, disambiguator: Disambiguator, head_processor: Optional[ProcessorPrototype], **kwargs): + super(DisambiguatorProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.disambiguator: Disambiguator = disambiguator + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + return annotation + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py new file mode 100644 index 0000000..171d406 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -0,0 +1,52 @@ +from .proto import ProcessorPrototype, RenamedTaskProcessor +from typing import Generator, Dict, List + + +class GlueProcessor(RenamedTaskProcessor): + """ Glues together specific tasks + + """ + + # Output keys are keys that are given in the end + OUTPUT_KEYS: List[str] = ["form", "lemma", "POS", "morph"] + # Glue dicts contains tasks that should merge together subtasks + GLUE: Dict[str, List[str]] = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + # Glue_char is what is used to glue things together -> Tense=Pres|Person=1 + GLUE_CHAR: str = "|" + # Glue Empty are value to take when all things glued together are empty + GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} + + def __init__(self): + super(GlueProcessor, self).__init__() + + # Sets-up some copy of the values + self._out = type(self).OUTPUT_KEYS + self._glue = type(self).GLUE + self._glue_char = type(self).GLUE_CHAR + self._glue_empty = type(self).GLUE_EMPTY + + def set_tasks(self, tasks): + super(GlueProcessor, self).set_tasks(tasks) + + def _yield_annotation( + self, + token_dict: Dict[str, str] + ) -> Generator[str, None, None]: + # For each key we should return + print(self.tasks) + for head in self._out: + if head not in self._glue: + yield head, token_dict[head] + else: + # Otherwise, we glue together things that should be glued together + joined = self._glue_char.join([token_dict[glued_task] for glued_task in self._glue[head]]) + if not joined: + joined = self._glue_empty[head] + yield head, joined + + def reinsert(self, form: str) -> Dict[str, str]: + return dict(form=form, **{key: self.empty_value for key in self._out if key != "form"}) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + as_dict = super(GlueProcessor, self).get_dict(token, tags) + return dict(self._yield_annotation(as_dict)) diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py new file mode 100644 index 0000000..83d7801 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -0,0 +1,29 @@ +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List +if "typing" == "nottyping": + from ..tokenizers.memorizing import MemorizingTokenizer + + +class MemoryzingProcessor(ChainedProcessor): + """ MemoryzingProcessor proposes to keep track of changes operated on input string + by reinserting the original data alongside a new task (KEY) where we output + the input seen by the Model + + """ + KEY: str = "treated" + + def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Optional[ProcessorPrototype], **kwargs): + 
super(MemoryzingProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.memory: "MemorizingTokenizer" = tokenizer_memory + self._key: str = type(self).KEY + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + # First we get the dictionary + token_dict = self.head_processor.get_dict(token, tags) + index, input_token, out_token = self.memory.tokens.pop(0) + if token != out_token: + raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) + + token_dict[self._key] = out_token + token_dict["form"] = input_token + return token_dict \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py new file mode 100644 index 0000000..dd1aefb --- /dev/null +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -0,0 +1,79 @@ +from typing import List, Dict, Optional, Type + +DEFAULT_EMPTY = "_" + + +class ProcessorPrototype: + tasks: List[str] + empty_value: str + + def __init__(self, empty_value: Optional[str] = None): + self.tasks = [] + self.empty_value = empty_value or DEFAULT_EMPTY + + def set_tasks(self, tasks): + self.tasks = tasks + + def postprocess(self, line): + pass + + def reinsert(self, form: str) -> Dict[str, str]: + """ Generates an automatic line for a token that was removed from lemmatization + + :param form: Token to reinsert + :return: Dictionary representation of the token, as an annotation + """ + return dict(form=form, **{task: self.empty_value for task in self.tasks}) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + """ Get the dictionary representation of a token annotation + + :param token: + :param tags: + :return: + """ + return dict(form=token, **dict(zip(self.tasks, tags))) + + def reset(self): + """ Functions that should be run in between documents """ + pass + + +class RenamedTaskProcessor(ProcessorPrototype): + MAP: Dict[str, str] + + def __init__(self, **kwargs): + super(RenamedTaskProcessor, self).__init__(**kwargs) + self._map: Dict[str, str] = type(self).MAP + + def set_tasks(self, tasks): + return [self._map.get(task, task) for task in tasks] + + +class ChainedProcessor(ProcessorPrototype): + """ Allows for easy chaining ! 
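+    (Illustrative wiring, as done by get_iterator_and_processor in this patch:
+    RuleBasedProcessor(MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=GlueProcessor())).
+    get_dict() and reinsert() delegate to head_processor, and subclasses post-process the returned annotation.)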
+ + ChainedProcessor(ProcessorPrototype) basically should behave like a normal processor + + """ + head_processor: ProcessorPrototype + + def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + super(ChainedProcessor, self).__init__(**kwargs) + + self.head_processor: ProcessorPrototype = head_processor + if not self.head_processor: + self.head_processor = ProcessorPrototype() + + def set_tasks(self, tasks): + super(ChainedProcessor, self).set_tasks(tasks) + self.head_processor.set_tasks(tasks) + + def reinsert(self, form: str) -> Dict[str, str]: + return self.head_processor.reinsert(form) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.head_processor.get_dict(token, tags) + + def reset(self): + self.head_processor.reset() \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py new file mode 100644 index 0000000..279f97e --- /dev/null +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -0,0 +1,21 @@ +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List +if "typing" == "nottyping": + from ..tokenizers.memorizing import MemorizingTokenizer + + +class RuleBasedProcessor(ChainedProcessor): + """ Applies rules found in rules(token_annotation) + + """ + KEY: str = "treated" + + def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs) + self._key: str = type(self).KEY + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + return annotation + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 8c2dc9d..9be9a01 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Optional, Dict, Generator, Type from pie.tagger import Tagger from pie import utils @@ -7,6 +7,7 @@ from .pipeline.formatters.proto import Formatter from .pipeline.disambiguators.proto import Disambiguator from .pipeline.iterators.proto import DataIterator +from .pipeline.postprocessor.proto import ProcessorPrototype class ExtensibleTagger(Tagger): @@ -30,7 +31,7 @@ def reinsert_full(self, formatter, sent_reinsertion, tasks): ) yield formatter.write_sentence_end() - def tag_file(self, fpath: str, iterator: DataIterator, formatter_class: type): + def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorPrototype): # Read content of the file with open(fpath) as f: data = f.read() @@ -38,16 +39,15 @@ def tag_file(self, fpath: str, iterator: DataIterator, formatter_class: type): _, ext = os.path.splitext(fpath) with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f: - for line in self.iter_tag(data, iterator, formatter_class): + for line in self.iter_tag(data, iterator, processor=processor): f.write(line) - def tag_str(self, data: str, iterator: DataIterator, formatter_class: type) -> str: - return "".join(list(self.iter_tag(data, iterator, formatter_class))) - - def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): - header = False - formatter = None + def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) -> str: + return list(self.iter_tag_token(data, iterator, processor=processor)) + def 
iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) \ + -> Generator[Dict[str, str], None, None]: + processor.reset() for chunk in utils.chunks( iterator(data, lower=self.lower), size=self.batch_size): @@ -61,7 +61,8 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): sents=[sent for sent in sents if sent], lengths=lengths ) - formatter: Formatter = formatter_class(tasks) + if not processor.tasks: + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): @@ -73,13 +74,6 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): # Gets things that needs to be reinserted sent_reinsertion = needs_reinsertion[sents_index] - # If the header has not yet be written, write it - if not header: - yield formatter.write_headers() - header = True - - yield formatter.write_sentence_beginning() - # If we have a disambiguator, we run the results into it if self.disambiguation: sent = self.disambiguation(sent, tasks) @@ -88,29 +82,23 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): for index, (token, tags) in enumerate(sent): while reinsertion_index + index in sent_reinsertion: - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion_index + index], - tags=[""] * len(tasks) - ) - ) + yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 - yield formatter.write_line( - formatter.format_line(token, tags) - ) + yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion], - tags=[""] * len(tasks) - ) - ) + yield processor.reinsert(sent_reinsertion[reinsertion]) - yield formatter.write_sentence_end() + def iter_tag(self, data: str, iterator: DataIterator, processor: type): + formatter = None + for annotation in self.iter_tag_token(data, iterator, processor): + if not formatter: + formatter = Formatter(list(annotation.keys())) + yield formatter.write_headers() + yield formatter.write_line(formatter) if formatter: - yield formatter.write_footer() + yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 2a5cac5..5527508 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -40,7 +40,7 @@ def test_consecutive_dots(self): result = tagger.tag_str( data="id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum ." " . causam turbationis hanc docuit quod pater", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) self.assertIn( @@ -62,7 +62,7 @@ def test_leading_punctuation(self): ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) self.assertIn( @@ -87,7 +87,7 @@ def test_punctuation_is_not_seen(self): ]) tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) self.assertNotIn( @@ -105,7 +105,7 @@ def test_j_are_temporarly_replaced(self): ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et judicis uiduarum . 
.", - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) @@ -121,7 +121,7 @@ def test_underscores(self): ]) tagger.tag_str( string, - formatter_class=formatter, + postprocessing_class=formatter, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) From 29e8e4eaf2204a41b22746f0380b1083b80c4e80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 16:54:22 +0100 Subject: [PATCH 02/13] Where are my tasks gone --- pie_extended/pipeline/postprocessor/proto.py | 5 ++++- pie_extended/tagger.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index dd1aefb..4a615f1 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -13,6 +13,7 @@ def __init__(self, empty_value: Optional[str] = None): def set_tasks(self, tasks): self.tasks = tasks + print(tasks, self.tasks) def postprocess(self, line): pass @@ -32,7 +33,9 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: :param tags: :return: """ - return dict(form=token, **dict(zip(self.tasks, tags))) + print("Do I have task ?", self.tasks) + print({"form":token, **{k: val for k, val in zip(self.tasks, tags)}}) + return {"form":token, **{k: val for k, val in zip(self.tasks, tags)}} def reset(self): """ Functions that should be run in between documents """ diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 9be9a01..46741e9 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -61,8 +61,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor sents=[sent for sent in sents if sent], lengths=lengths ) - if not processor.tasks: - processor.set_tasks(tasks) + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): From 51734a3e6ec2d4808b65f5a0cb6d45c1b4acc269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 19:37:52 +0100 Subject: [PATCH 03/13] (Architecture Rework) Completely reworked tokenizer --- pie_extended/models/fro/classes.py | 50 +++---- pie_extended/models/lasla/classes.py | 131 +++++++++++------- pie_extended/pipeline/iterators/proto.py | 16 +-- pie_extended/pipeline/postprocessor/glue.py | 5 +- pie_extended/pipeline/postprocessor/memory.py | 6 +- pie_extended/pipeline/postprocessor/proto.py | 9 +- .../pipeline/tokenizers/memorizing.py | 67 ++++----- .../pipeline/tokenizers/simple_tokenizer.py | 33 +++++ pie_extended/tagger.py | 9 +- tests/test_models/test_lasla.py | 26 ++-- 10 files changed, 201 insertions(+), 151 deletions(-) create mode 100644 pie_extended/pipeline/tokenizers/simple_tokenizer.py diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/classes.py index 01abff4..482b696 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/classes.py @@ -1,14 +1,11 @@ import regex as re -from typing import List, Dict -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer +from typing import List, Dict, Generator +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor from 
pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor from pie_extended.pipeline.postprocessor.glue import GlueProcessor -# Uppercase regexp -_uppercase = re.compile("^[A-ZÉÈÀÂÊÎÔÛŶÄËÏÖÜŸ]$") _Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“" _Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]" @@ -20,7 +17,7 @@ r"(?:XC|XL|L?X{0,3})(?:IX|I?V|V?I{1,3}))" -class MemorizingTokenizer(SourceMemorizingTokenizer): +class FroMemorizingTokenizer(MemorizingTokenizer): re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" @@ -35,11 +32,7 @@ class MemorizingTokenizer(SourceMemorizingTokenizer): roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") def __init__(self): - super(MemorizingTokenizer, self).__init__( - sentence_tokenizer=self._sentence_tokenizer, - word_tokenizer=self._word_tokenizer, - normalizer=self._normalizer - ) + super(FroMemorizingTokenizer, self).__init__() self.tokens = [] @staticmethod @@ -54,25 +47,22 @@ def _real_sentence_tokenizer(cls, string: str) -> List[str]: string = string.replace("_DOT_", ".") return string.split("") - @staticmethod - def _word_tokenizer(data): - # ICI, il faut que tu tokenizes toi-meme avec une fonction à toi - return data.split() + def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + if lower: + text = text.lower() + text = text.split() + return text - def _sentence_tokenizer(self, data): + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: sentences = list() - data = self.normalizer(data) + data = self.normalizer(text) for sent in self._real_sentence_tokenizer(data): sent = sent.strip() if sent: - sentences.append(sent) + sentences.append(self.word_tokenizer(sent)) yield from sentences - def _replacer(self, inp: str): - out = self.re_remove_ending_apostrophe.sub("", inp) - return out - - def _normalizer(self, data: str): + def normalizer(self, data: str) -> str: data = self.re_remove_ending_apostrophe.sub( r"\g<1> ", self.re_add_space_around_apostrophe_that_are_quotes.sub( @@ -90,6 +80,9 @@ def _normalizer(self, data: str): class FroRulesProcessor(RuleBasedProcessor): + """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically + + """ PONCTU = re.compile(r"^\W+$") NUMBER = re.compile(r"\d+") PONFORT = [".", "...", "!", "?"] @@ -106,15 +99,24 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: annotation["pos"] = "ADJcar" return annotation + def __init__(self, *args, **kwargs): + super(FroRulesProcessor, self).__init__(*args, **kwargs) + class FroGlueProcessor(GlueProcessor): + """ We glue morphological features into one column + + """ OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + def __init__(self, *args, **kwargs): + super(FroGlueProcessor, self).__init__(*args, **kwargs) + def get_iterator_and_processor(): - tokenizer = MemorizingTokenizer() + tokenizer = FroMemorizingTokenizer() processor = FroRulesProcessor( MemoryzingProcessor( tokenizer_memory=tokenizer, diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py index 3599be8..fd0ad69 100644 --- a/pie_extended/models/lasla/classes.py +++ 
b/pie_extended/models/lasla/classes.py @@ -1,6 +1,13 @@ +from typing import Dict, List, Generator import sys import regex as re import click +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer +from pie_extended.models.fro.classes import _RomanNumber, _Dots_except_apostrophe, _Dots_collections try: import cltk @@ -12,23 +19,19 @@ sys.exit(0) -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.disambiguator import DisambiguatorProcessor -from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor -from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor -from pie_extended.pipeline.postprocessor.glue import GlueProcessor -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer -from typing import Dict +class LatinRulesProcessor(RuleBasedProcessor): + """ Lasla data has no punctuation, we tag it automatically. + "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) -class LatinRulesProcessor(RuleBasedProcessor): + """ PONCTU = re.compile(r"^\W+$") def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: # If Else condition token = annotation["form"] if self.PONCTU.match(token): - return {"form": token, "lemma": token, "POS": "PUNC", "morph": "MORPH=empty"} + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} elif token.startswith("-"): if token == "-ne": annotation["lemma"] = "ne2" @@ -36,71 +39,101 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: annotation["lemma"] = "ne" return annotation + def __init__(self, *args, **kwargs): + super(LatinRulesProcessor, self).__init__(*args, **kwargs) + class LatinGlueProcessor(GlueProcessor): OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} WHEN_EMPTY = {"morph": "MORPH=empty"} + MAP = {"pos": "POS"} + + def __init__(self, *args, **kwargs): + super(LatinGlueProcessor, self).__init__(*args, **kwargs) + # Uppercase regexp uppercase = re.compile(r"^[A-Z]$") -class MemorizingTokenizer(SourceMemorizingTokenizer): - - re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\.])(\s*)") - re_normalize_space = re.compile(r"(\s+)") - re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1) +class LatMemorizingTokenizer(MemorizingTokenizer): + re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") + re_add_space_around_apostrophe_that_are_quotes = re.compile( + r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" + # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + # ?'. 
or manger'_ or _'Bonjour + ) + re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") + re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])") + _sentence_boundaries = re.compile( + r"([" + _Dots_except_apostrophe + r"]+\s*)+" + ) + roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") def __init__(self): - self.tokens = [ - ] - + super(LatMemorizingTokenizer, self).__init__() + self.tokens = [] self._word_tokenizer = WordTokenizer("latin") - def word_tokenizer(self, data): - return self._word_tokenizer.tokenize(data) - - def sentence_tokenizer(self, data): + @staticmethod + def _sentence_tokenizer_merge_matches(match): + """ Best way we found to deal with repeating groups""" + start, end = match.span() + return match.string[start:end] + "" + + @classmethod + def _real_sentence_tokenizer(cls, string: str) -> List[str]: + string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) + string = string.replace("_DOT_", ".") + return string.split("") + + def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + tokenized = [tok for tok in self._word_tokenizer.tokenize(text) if tok] + if tokenized: + tokenized = [tok.lower() for tok in tokenized] + return tokenized + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: sentences = list() - first_is_dot = False - started_writting = False # Allows for avoiding to compute length - for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data): + data = self.normalizer(text) + for sent in self._real_sentence_tokenizer(data): sent = sent.strip() if sent: - if MemorizingTokenizer.re_sentence_tokenizer.match(sent): - if not started_writting: - sentences.append(sent) - first_is_dot = True - else: - sentences[-1] += " " + sent - else: - if first_is_dot: - sentences[-1] += " " + sent - first_is_dot = False - else: - sentences.append(sent) - - if not started_writting and len(sentences): - started_writting = True - + sentences.append(self.word_tokenizer(sent)) yield from sentences + def normalizer(self, data: str) -> str: + data = self.re_remove_ending_apostrophe.sub( + r"\g<1> ", + self.re_add_space_around_apostrophe_that_are_quotes.sub( + r" \g<2> ", + self.re_add_space_around_punct.sub( + r" \g<2> ", + self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) + ) + ) + ) + return data + def replacer(self, inp: str): - inp = inp.replace("U", "V").replace("v", "u").replace("J", "I").replace("j", "i").lower() + inp = inp.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i") return inp - def normalizer(self, data: str): - # Fix regarding the current issue of apostrophe - # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 - # On the other hand, it creates empty tokens... - data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) - data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - return data + #def normalizer(self, data: str): + # # Fix regarding the current issue of apostrophe + # # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 + # # On the other hand, it creates empty tokens... 
+ # data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) + # data = MemorizingTokenizer.re_normalize_space.sub(" ", data) + # return data def get_iterator_and_processor(): - tokenizer = MemorizingTokenizer() + tokenizer = LatMemorizingTokenizer() processor = LatinRulesProcessor( MemoryzingProcessor( tokenizer_memory=tokenizer, diff --git a/pie_extended/pipeline/iterators/proto.py b/pie_extended/pipeline/iterators/proto.py index 8229ec5..89d0bae 100644 --- a/pie_extended/pipeline/iterators/proto.py +++ b/pie_extended/pipeline/iterators/proto.py @@ -1,23 +1,22 @@ import regex as re -import string from pie.tagger import simple_tokenizer from typing import Callable, List, Tuple, Dict, Union, Iterable -from ...pipeline.tokenizers.classes import Tokenizer from ...utils import ObjectCreator +from ..tokenizers.simple_tokenizer import SimpleTokenizer Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]] PUNKT = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1) class DataIterator: - def __init__(self, tokenizer: Union[ObjectCreator, Tokenizer] = None, remove_from_input: Callable = None): + def __init__(self, tokenizer: SimpleTokenizer = None, remove_from_input: Callable = None): """ Iterator used to parse the text and returns bits to tag :param tokenizer: Tokenizer """ - self.tokenizer = tokenizer or simple_tokenizer + self.tokenizer: SimpleTokenizer = tokenizer or SimpleTokenizer() self.remove_from_input = remove_from_input if self.remove_from_input is None: self.remove_from_input = lambda x: (x, {}) @@ -41,12 +40,6 @@ def remove_punctuation(sentence: List[str]) -> Tuple[List[str], Dict[int, str]]: clean.append(token) return clean, removed - def get_tokenizer(self) -> Tokenizer: - """ Get the tokenizer if it needs to be created""" - if isinstance(self.tokenizer, ObjectCreator): - return self.tokenizer.create() - return self.tokenizer - def get_remover(self) -> Remover: if isinstance(self.remove_from_input, ObjectCreator): return self.remove_from_input.create() @@ -60,8 +53,7 @@ def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], :param lower: Whether or not to lower the text :yields: (Sentence as a list of word, Size of the sentence, Elements removed from the sentence) """ - tokenizer = self.get_tokenizer() remover = self.get_remover() - for sentence in tokenizer(data, lower=lower): + for sentence in self.tokenizer.sentence_tokenizer(data, lower=lower): clean_sentence, removed_from_input = remover(sentence) yield clean_sentence, len(clean_sentence), removed_from_input diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 171d406..0749394 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -16,8 +16,8 @@ class GlueProcessor(RenamedTaskProcessor): # Glue Empty are value to take when all things glued together are empty GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} - def __init__(self): - super(GlueProcessor, self).__init__() + def __init__(self, *args, **kwargs): + super(GlueProcessor, self).__init__(*args, **kwargs) # Sets-up some copy of the values self._out = type(self).OUTPUT_KEYS @@ -33,7 +33,6 @@ def _yield_annotation( token_dict: Dict[str, str] ) -> Generator[str, None, None]: # For each key we should return - print(self.tasks) for head in self._out: if head not in self._glue: yield head, token_dict[head] diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index 
83d7801..b86183e 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -26,4 +26,8 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: token_dict[self._key] = out_token token_dict["form"] = input_token - return token_dict \ No newline at end of file + return token_dict + + def reinsert(self, form: str) -> Dict[str, str]: + self.memory.tokens.pop(0) + return super(MemoryzingProcessor, self).reinsert(form) \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index 4a615f1..d8a7e49 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -13,7 +13,6 @@ def __init__(self, empty_value: Optional[str] = None): def set_tasks(self, tasks): self.tasks = tasks - print(tasks, self.tasks) def postprocess(self, line): pass @@ -33,9 +32,7 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: :param tags: :return: """ - print("Do I have task ?", self.tasks) - print({"form":token, **{k: val for k, val in zip(self.tasks, tags)}}) - return {"form":token, **{k: val for k, val in zip(self.tasks, tags)}} + return {"form": token, **{k: val for k, val in zip(self.tasks, tags)}} def reset(self): """ Functions that should be run in between documents """ @@ -43,14 +40,14 @@ def reset(self): class RenamedTaskProcessor(ProcessorPrototype): - MAP: Dict[str, str] + MAP: Dict[str, str] = {} def __init__(self, **kwargs): super(RenamedTaskProcessor, self).__init__(**kwargs) self._map: Dict[str, str] = type(self).MAP def set_tasks(self, tasks): - return [self._map.get(task, task) for task in tasks] + self.tasks = [self._map.get(task, task) for task in tasks] class ChainedProcessor(ProcessorPrototype): diff --git a/pie_extended/pipeline/tokenizers/memorizing.py b/pie_extended/pipeline/tokenizers/memorizing.py index b338529..13cd39e 100644 --- a/pie_extended/pipeline/tokenizers/memorizing.py +++ b/pie_extended/pipeline/tokenizers/memorizing.py @@ -1,45 +1,32 @@ -class MemorizingTokenizer(object): +from .simple_tokenizer import SimpleTokenizer +from typing import List, Tuple, Dict + + +class MemorizingTokenizer(SimpleTokenizer): """ Tokenizer that memoryze what it tokenized. Mostly used to normalized input as input time and then reinserting normalized input """ - @staticmethod - def _sentence_tokenizer(string): - for s in string.split("."): - if s.strip(): - yield s.strip() + " " + "." 
- - @staticmethod - def _word_tokenizer(string): - for s in string.split(): - if s.strip: - yield s.strip() - - @staticmethod - def _replacer(inp: str): - return inp - - def __init__(self, sentence_tokenizer=None, word_tokenizer=None, replacer=None, normalizer=None): - self.tokens = [ - ] - - self.sentence_tokenizer = sentence_tokenizer or self._sentence_tokenizer - self.word_tokenizer = word_tokenizer or self._word_tokenizer - self.replacer = replacer or self._replacer - self.normalizer = normalizer or self._replacer - - def __call__(self, data, lower=True): - if lower: - data = data.lower() - for sentence in self.sentence_tokenizer(data): - toks = self.word_tokenizer(sentence) - new_sentence = [] - - for tok in toks: - if tok: - out = self.replacer(tok) - self.tokens.append((len(self.tokens), tok, out)) - new_sentence.append(out) - if new_sentence: - yield new_sentence + + def replacer(self, token: str) -> str: + """ This function allows for changing input and keeping it in memory """ + return token + + def __init__(self): + self.tokens: List[Tuple[int, int, str]] = [] + + def _real_word_tokenizer(self, data: str, lower: bool = False) -> List[str]: + return super(MemorizingTokenizer, self).word_tokenizer(data, lower=lower) + + def word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + sentence = [] + for token in self._real_word_tokenizer(text, lower): + out = self.replacer(token) + self.tokens.append((len(self.tokens), token, out)) + sentence.append(out) + return sentence + + def reset(self): # Empty + self.tokens = [] + diff --git a/pie_extended/pipeline/tokenizers/simple_tokenizer.py b/pie_extended/pipeline/tokenizers/simple_tokenizer.py new file mode 100644 index 0000000..ef633a9 --- /dev/null +++ b/pie_extended/pipeline/tokenizers/simple_tokenizer.py @@ -0,0 +1,33 @@ +from typing import Generator, List +import regex as re +import string +from pie.tagger import regexsplitter, SECTION, FULLSTOP + +WORD = r'([{}])'.format(string.punctuation) + + +class SimpleTokenizer(object): + """ Tokenizer that memoryze what it tokenized. + + Mostly used to normalized input as input time and then reinserting normalized input + + """ + def __init__(self): + self.section = regexsplitter(SECTION) + self.fullstop = regexsplitter(FULLSTOP) + self.word = regexsplitter(WORD) + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: + for line in self.section(text): + for sentence in self.fullstop(line): + yield self.word_tokenizer(sentence, lower=lower) + + def word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + sentence = [w for raw in text.split() for w in self.word(raw)] + if lower: + sentence = [w.lower() for w in sentence] + return sentence + + def reset(self): + """Can be used between documents for example """ + pass diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 46741e9..413a3cf 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -47,7 +47,10 @@ def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototy def iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) \ -> Generator[Dict[str, str], None, None]: + # Reset at each document processor.reset() + iterator.tokenizer.reset() + # Iterate ! 
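+        # Each chunk is a batch of tokenized sentences: the batch is tagged,
+        # tokens removed by the iterator are reinserted via processor.reinsert(),
+        # and each remaining token is yielded as a {task: value} annotation dict
+        # built by processor.get_dict().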
for chunk in utils.chunks( iterator(data, lower=self.lower), size=self.batch_size): @@ -61,7 +64,8 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor sents=[sent for sent in sents if sent], lengths=lengths ) - processor.set_tasks(tasks) + if not processor.tasks: + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): @@ -84,7 +88,6 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 - yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): @@ -97,7 +100,7 @@ def iter_tag(self, data: str, iterator: DataIterator, processor: type): if not formatter: formatter = Formatter(list(annotation.keys())) yield formatter.write_headers() - yield formatter.write_line(formatter) + yield formatter.write_line(annotation) if formatter: yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 5527508..fa37827 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -1,4 +1,4 @@ -from pie_extended.models.lasla.classes import get_iterator_and_formatter +from pie_extended.models.lasla.classes import get_iterator_and_processor from pie_extended.testing_utils import FakeTagger from typing import List, Tuple @@ -21,8 +21,8 @@ def make_controller(sentences: List[str]): make_fake_data(sentences), tasks="lemma,Voice,Mood,Deg,Numb,Person,Tense,Case,Gend,pos".split(",") ) - iterator, formatter = get_iterator_and_formatter() - return tagger, iterator, formatter + iterator, processor = get_iterator_and_processor() + return tagger, iterator, processor class TestPonctuation(TestCase): @@ -32,7 +32,7 @@ def test_consecutive_dots(self): Found out the hard way it would break things """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum", "causam turbationis hanc docuit quod pater" ]) @@ -40,7 +40,7 @@ def test_consecutive_dots(self): result = tagger.tag_str( data="id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum ." " . causam turbationis hanc docuit quod pater", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) self.assertIn( @@ -57,12 +57,12 @@ def test_leading_punctuation(self): Special case of consecutive dots, where sentences starts with it """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) self.assertIn( @@ -82,12 +82,12 @@ def test_punctuation_is_not_seen(self): """Check that punctuation is not seen by the tagger """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . 
.", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) self.assertNotIn( @@ -100,12 +100,12 @@ def test_j_are_temporarly_replaced(self): """Check that characters are replaced for the tagger, thus avoiding out of domain, and reinserted """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et judicis uiduarum . .", - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) @@ -116,12 +116,12 @@ def test_j_are_temporarly_replaced(self): def test_underscores(self): string = "una operatio in ecclesiae fundamento.._... _ . laetatur autem pater quia filius perierat" - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "una operatio in ecclesiae fundamento", "laetatur autem pater quia filius perierat" ]) tagger.tag_str( string, - postprocessing_class=formatter, + processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) From 3e519ca8880bc6b03ad0171576571a616850afe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 19:47:54 +0100 Subject: [PATCH 04/13] Definitely bugging but not crashing though ! --- pie_extended/models/fro/__init__.py | 2 +- pie_extended/models/fro/get.py | 20 ++++++ pie_extended/models/fro/processor.py | 41 ++++++++++++ .../models/fro/{classes.py => tokenizer.py} | 62 +------------------ pie_extended/models/lasla/classes.py | 2 +- 5 files changed, 66 insertions(+), 61 deletions(-) create mode 100644 pie_extended/models/fro/get.py create mode 100644 pie_extended/models/fro/processor.py rename pie_extended/models/fro/{classes.py => tokenizer.py} (60%) diff --git a/pie_extended/models/fro/__init__.py b/pie_extended/models/fro/__init__.py index 63da07b..86506e2 100644 --- a/pie_extended/models/fro/__init__.py +++ b/pie_extended/models/fro/__init__.py @@ -1,5 +1,5 @@ from ...utils import Metadata, File, get_path -from .classes import get_iterator_and_processor +from .get import get_iterator_and_processor from ...pipeline.iterators.proto import DataIterator DESC = Metadata( diff --git a/pie_extended/models/fro/get.py b/pie_extended/models/fro/get.py new file mode 100644 index 0000000..04154d8 --- /dev/null +++ b/pie_extended/models/fro/get.py @@ -0,0 +1,20 @@ +from .processor import FroRulesProcessor, FroGlueProcessor +from .tokenizer import FroMemorizingTokenizer +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor + + +def get_iterator_and_processor(): + tokenizer = FroMemorizingTokenizer() + processor = FroRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=FroGlueProcessor() + ) + ) + iterator = DataIterator( + tokenizer=tokenizer, + remove_from_input=DataIterator.remove_punctuation + ) + return iterator, processor + diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py new file mode 100644 index 0000000..5c54e7a --- /dev/null +++ b/pie_extended/models/fro/processor.py @@ -0,0 +1,41 @@ +import regex as re +from typing import Dict + +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from 
pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor + + +class FroRulesProcessor(RuleBasedProcessor): + """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically + + """ + PONCTU = re.compile(r"^\W+$") + NUMBER = re.compile(r"\d+") + PONFORT = [".", "...", "!", "?"] + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + token = annotation["form"] + if self.PONCTU.match(token): + if token in self.PONFORT: + pos = "PONfrt" + else: + pos = "PONfbl" + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} + elif self.NUMBER.match(token): + annotation["pos"] = "ADJcar" + return annotation + + def __init__(self, *args, **kwargs): + super(FroRulesProcessor, self).__init__(*args, **kwargs) + + +class FroGlueProcessor(GlueProcessor): + """ We glue morphological features into one column + + """ + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} + MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + + def __init__(self, *args, **kwargs): + super(FroGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/tokenizer.py similarity index 60% rename from pie_extended/models/fro/classes.py rename to pie_extended/models/fro/tokenizer.py index 482b696..fb1c3e9 100644 --- a/pie_extended/models/fro/classes.py +++ b/pie_extended/models/fro/tokenizer.py @@ -1,11 +1,7 @@ import regex as re -from typing import List, Dict, Generator -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor -from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor -from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from typing import List, Generator +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer _Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“" _Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]" @@ -76,56 +72,4 @@ def normalizer(self, data: str) -> str: ) ) ) - return data - - -class FroRulesProcessor(RuleBasedProcessor): - """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically - - """ - PONCTU = re.compile(r"^\W+$") - NUMBER = re.compile(r"\d+") - PONFORT = [".", "...", "!", "?"] - - def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: - token = annotation["form"] - if self.PONCTU.match(token): - if token in self.PONFORT: - pos = "PONfrt" - else: - pos = "PONfbl" - return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} - elif self.NUMBER.match(token): - annotation["pos"] = "ADJcar" - return annotation - - def __init__(self, *args, **kwargs): - super(FroRulesProcessor, self).__init__(*args, **kwargs) - - -class FroGlueProcessor(GlueProcessor): - """ We glue morphological features into one column - - """ - OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] - GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} - MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} - - def __init__(self, *args, **kwargs): - super(FroGlueProcessor, self).__init__(*args, **kwargs) - - -def get_iterator_and_processor(): - tokenizer = FroMemorizingTokenizer() - processor = FroRulesProcessor( - MemoryzingProcessor( - tokenizer_memory=tokenizer, - 
head_processor=FroGlueProcessor() - ) - ) - iterator = DataIterator( - tokenizer=tokenizer, - remove_from_input=DataIterator.remove_punctuation - ) - return iterator, processor - + return data \ No newline at end of file diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py index fd0ad69..001ed89 100644 --- a/pie_extended/models/lasla/classes.py +++ b/pie_extended/models/lasla/classes.py @@ -7,7 +7,7 @@ from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor from pie_extended.pipeline.postprocessor.glue import GlueProcessor from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer -from pie_extended.models.fro.classes import _RomanNumber, _Dots_except_apostrophe, _Dots_collections +from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _Dots_collections, _RomanNumber try: import cltk From 9914265c8b0c4e172b2dc5c93ab3cbf78be97332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Thu, 20 Feb 2020 19:49:57 +0100 Subject: [PATCH 05/13] (Refactored latin as well) --- pie_extended/models/lasla/__init__.py | 2 +- pie_extended/models/lasla/get.py | 24 +++++++ pie_extended/models/lasla/processor.py | 39 +++++++++++ .../models/lasla/{classes.py => tokenizer.py} | 66 ++----------------- 4 files changed, 69 insertions(+), 62 deletions(-) create mode 100644 pie_extended/models/lasla/get.py create mode 100644 pie_extended/models/lasla/processor.py rename pie_extended/models/lasla/{classes.py => tokenizer.py} (63%) diff --git a/pie_extended/models/lasla/__init__.py b/pie_extended/models/lasla/__init__.py index 7074413..512e9f5 100644 --- a/pie_extended/models/lasla/__init__.py +++ b/pie_extended/models/lasla/__init__.py @@ -1,2 +1,2 @@ from pie_extended.models.lasla.consts import DOWNLOADS, Models, Disambiguator, addons, DESC -from pie_extended.models.lasla.classes import get_iterator_and_processor +from pie_extended.models.lasla.get import get_iterator_and_processor diff --git a/pie_extended/models/lasla/get.py b/pie_extended/models/lasla/get.py new file mode 100644 index 0000000..1b83700 --- /dev/null +++ b/pie_extended/models/lasla/get.py @@ -0,0 +1,24 @@ +import regex as re + +from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor +from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor + +# Uppercase regexp +uppercase = re.compile(r"^[A-Z]$") + + +def get_iterator_and_processor(): + tokenizer = LatMemorizingTokenizer() + processor = LatinRulesProcessor( + MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=LatinGlueProcessor() + ) + ) + iterator = DataIterator( + tokenizer=tokenizer, + remove_from_input=DataIterator.remove_punctuation + ) + return iterator, processor diff --git a/pie_extended/models/lasla/processor.py b/pie_extended/models/lasla/processor.py new file mode 100644 index 0000000..c4293b0 --- /dev/null +++ b/pie_extended/models/lasla/processor.py @@ -0,0 +1,39 @@ +import regex as re +from typing import Dict + +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor + + +class LatinRulesProcessor(RuleBasedProcessor): + """ Lasla data has no punctuation, we tag it automatically. + + "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) 
+ + """ + PONCTU = re.compile(r"^\W+$") + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + # If Else condition + token = annotation["form"] + if self.PONCTU.match(token): + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} + elif token.startswith("-"): + if token == "-ne": + annotation["lemma"] = "ne2" + else: + annotation["lemma"] = "ne" + return annotation + + def __init__(self, *args, **kwargs): + super(LatinRulesProcessor, self).__init__(*args, **kwargs) + + +class LatinGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + WHEN_EMPTY = {"morph": "MORPH=empty"} + MAP = {"pos": "POS"} + + def __init__(self, *args, **kwargs): + super(LatinGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/tokenizer.py similarity index 63% rename from pie_extended/models/lasla/classes.py rename to pie_extended/models/lasla/tokenizer.py index 001ed89..7d1f6b6 100644 --- a/pie_extended/models/lasla/classes.py +++ b/pie_extended/models/lasla/tokenizer.py @@ -1,13 +1,10 @@ -from typing import Dict, List, Generator -import sys import regex as re import click -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor -from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor -from pie_extended.pipeline.postprocessor.glue import GlueProcessor +import sys +from typing import List, Generator + +from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _RomanNumber from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer -from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _Dots_collections, _RomanNumber try: import cltk @@ -19,44 +16,6 @@ sys.exit(0) -class LatinRulesProcessor(RuleBasedProcessor): - """ Lasla data has no punctuation, we tag it automatically. - - "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) - - """ - PONCTU = re.compile(r"^\W+$") - - def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: - # If Else condition - token = annotation["form"] - if self.PONCTU.match(token): - return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} - elif token.startswith("-"): - if token == "-ne": - annotation["lemma"] = "ne2" - else: - annotation["lemma"] = "ne" - return annotation - - def __init__(self, *args, **kwargs): - super(LatinRulesProcessor, self).__init__(*args, **kwargs) - - -class LatinGlueProcessor(GlueProcessor): - OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] - GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} - WHEN_EMPTY = {"morph": "MORPH=empty"} - MAP = {"pos": "POS"} - - def __init__(self, *args, **kwargs): - super(LatinGlueProcessor, self).__init__(*args, **kwargs) - - -# Uppercase regexp -uppercase = re.compile(r"^[A-Z]$") - - class LatMemorizingTokenizer(MemorizingTokenizer): re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( @@ -129,19 +88,4 @@ def replacer(self, inp: str): # # On the other hand, it creates empty tokens... 
# data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) # data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - # return data - - -def get_iterator_and_processor(): - tokenizer = LatMemorizingTokenizer() - processor = LatinRulesProcessor( - MemoryzingProcessor( - tokenizer_memory=tokenizer, - head_processor=LatinGlueProcessor() - ) - ) - iterator = DataIterator( - tokenizer=tokenizer, - remove_from_input=DataIterator.remove_punctuation - ) - return iterator, processor + # return data \ No newline at end of file From dce914870a0c53e53a5c18a5357b2b0bfd987467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 12:22:58 +0100 Subject: [PATCH 06/13] (Architecture changes) Fixed issues with rules not applied on reinsert. Fixed writing that was writing keys instead of values --- pie_extended/cli/__init__.py | 9 ++++++++- pie_extended/models/fro/get.py | 3 ++- pie_extended/models/fro/processor.py | 4 +++- pie_extended/models/lasla/get.py | 3 ++- pie_extended/models/lasla/processor.py | 2 +- pie_extended/pipeline/postprocessor/glue.py | 18 +++++++++++++----- .../pipeline/postprocessor/rulebased.py | 13 ++++++++++++- pie_extended/tagger.py | 14 +------------- tests/test_models/test_lasla.py | 2 +- 9 files changed, 43 insertions(+), 25 deletions(-) diff --git a/pie_extended/cli/__init__.py b/pie_extended/cli/__init__.py index 0742dbe..b4d727c 100644 --- a/pie_extended/cli/__init__.py +++ b/pie_extended/cli/__init__.py @@ -58,7 +58,14 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path) """ Tag as many [filepath] as you want with [model] """ from tqdm import tqdm click.echo(click.style("Getting the tagger", bold=True)) - tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path) + try: + tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path) + except FileNotFoundError as e: + click.echo("Model not found: please make sure you have downloaded the model files with " + "pie-extended download " + model) + if debug: + raise e + return failures = [] for file in tqdm(filepath): try: diff --git a/pie_extended/models/fro/get.py b/pie_extended/models/fro/get.py index 04154d8..4bd2a43 100644 --- a/pie_extended/models/fro/get.py +++ b/pie_extended/models/fro/get.py @@ -7,7 +7,8 @@ def get_iterator_and_processor(): tokenizer = FroMemorizingTokenizer() processor = FroRulesProcessor( - MemoryzingProcessor( + apply_on_reinsert=True, + head_processor=MemoryzingProcessor( tokenizer_memory=tokenizer, head_processor=FroGlueProcessor() ) diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py index 5c54e7a..d17e51f 100644 --- a/pie_extended/models/fro/processor.py +++ b/pie_extended/models/fro/processor.py @@ -20,7 +20,7 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: pos = "PONfrt" else: pos = "PONfbl" - return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty"} + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty", "treated": token} elif self.NUMBER.match(token): annotation["pos"] = "ADJcar" return annotation @@ -36,6 +36,8 @@ class FroGlueProcessor(GlueProcessor): OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + EMPTY_TAG: Dict[str, str] = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_", "TEMPS": "_", "GENRE": "_", + "PERS.": "_"} 
def __init__(self, *args, **kwargs): super(FroGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/lasla/get.py b/pie_extended/models/lasla/get.py index 1b83700..3c6e582 100644 --- a/pie_extended/models/lasla/get.py +++ b/pie_extended/models/lasla/get.py @@ -12,7 +12,8 @@ def get_iterator_and_processor(): tokenizer = LatMemorizingTokenizer() processor = LatinRulesProcessor( - MemoryzingProcessor( + apply_on_reinsert=True, + head_processor=MemoryzingProcessor( tokenizer_memory=tokenizer, head_processor=LatinGlueProcessor() ) diff --git a/pie_extended/models/lasla/processor.py b/pie_extended/models/lasla/processor.py index c4293b0..8af49bd 100644 --- a/pie_extended/models/lasla/processor.py +++ b/pie_extended/models/lasla/processor.py @@ -17,7 +17,7 @@ def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: # If Else condition token = annotation["form"] if self.PONCTU.match(token): - return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty"} + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty", "treated": token} elif token.startswith("-"): if token == "-ne": annotation["lemma"] = "ne2" diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 0749394..6a54ed4 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -15,15 +15,19 @@ class GlueProcessor(RenamedTaskProcessor): GLUE_CHAR: str = "|" # Glue Empty are value to take when all things glued together are empty GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} + # Value that means the current element is empty + EMPTY_TAG: Dict[str, str] = {"Case": "_", "Numb": "_", "Deg": "_", "Mood": "_", "Tense": "_", "Voice": "_", + "Person": "_"} def __init__(self, *args, **kwargs): super(GlueProcessor, self).__init__(*args, **kwargs) # Sets-up some copy of the values - self._out = type(self).OUTPUT_KEYS - self._glue = type(self).GLUE - self._glue_char = type(self).GLUE_CHAR - self._glue_empty = type(self).GLUE_EMPTY + self._out = self.OUTPUT_KEYS + self._glue = self.GLUE + self._glue_char = self.GLUE_CHAR + self._glue_empty = self.GLUE_EMPTY + self._empty_tags = self.EMPTY_TAG def set_tasks(self, tasks): super(GlueProcessor, self).set_tasks(tasks) @@ -38,7 +42,11 @@ def _yield_annotation( yield head, token_dict[head] else: # Otherwise, we glue together things that should be glued together - joined = self._glue_char.join([token_dict[glued_task] for glued_task in self._glue[head]]) + joined = self._glue_char.join([ + glued_task + "=" + token_dict[glued_task] + for glued_task in self._glue[head] + if token_dict[glued_task] != self._empty_tags.get(glued_task, -1) + ]) if not joined: joined = self._glue_empty[head] yield head, joined diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py index 279f97e..d2d5f1b 100644 --- a/pie_extended/pipeline/postprocessor/rulebased.py +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -10,12 +10,23 @@ class RuleBasedProcessor(ChainedProcessor): """ KEY: str = "treated" - def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs): + """ Apply rules on output of the taggers + + :param apply_on_reinsert: Apply rules on reinsert task + """ super(RuleBasedProcessor, self).__init__(head_processor=head_processor, 
**kwargs) self._key: str = type(self).KEY + self.apply_on_reinsert= apply_on_reinsert def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: return annotation + def reinsert(self, form: str) -> Dict[str, str]: + anno = super(RuleBasedProcessor, self).reinsert(form) + if self.apply_on_reinsert: + return self.rules(anno) + return anno + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 413a3cf..90d9ab6 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -19,18 +19,6 @@ def __init__(self, device='cpu', batch_size=100, lower=False, disambiguation=Non ) self.disambiguation: Optional[Disambiguator] = disambiguation - def reinsert_full(self, formatter, sent_reinsertion, tasks): - yield formatter.write_sentence_beginning() - # If a sentence is empty, it's most likely because everything is in sent_reinsertions - for reinsertion in sorted(list(sent_reinsertion.keys())): - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion], - tags=[""] * len(tasks) - ) - ) - yield formatter.write_sentence_end() - def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorPrototype): # Read content of the file with open(fpath) as f: @@ -100,7 +88,7 @@ def iter_tag(self, data: str, iterator: DataIterator, processor: type): if not formatter: formatter = Formatter(list(annotation.keys())) yield formatter.write_headers() - yield formatter.write_line(annotation) + yield formatter.write_line(formatter.format_line(annotation)) if formatter: yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index fa37827..f9a1d09 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -1,4 +1,4 @@ -from pie_extended.models.lasla.classes import get_iterator_and_processor +from pie_extended.models.lasla.get import get_iterator_and_processor from pie_extended.testing_utils import FakeTagger from typing import List, Tuple From e5e68bb52ff68292176d1fe3c7401438ff1ec900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 13:06:59 +0100 Subject: [PATCH 07/13] (Fixing tags) --- pie_extended/models/fro/tokenizer.py | 7 +++-- pie_extended/models/lasla/tokenizer.py | 25 ++------------- pie_extended/pipeline/postprocessor/memory.py | 2 +- .../pipeline/tokenizers/memorizing.py | 2 +- pie_extended/tagger.py | 5 +-- tests/test_models/test_lasla.py | 31 ++++++++----------- 6 files changed, 25 insertions(+), 47 deletions(-) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index fb1c3e9..bd7c80f 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -14,7 +14,7 @@ class FroMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter @@ -72,4 +72,7 @@ def normalizer(self, data: str) -> str: ) ) ) - return data \ No newline at end of file + return data + + def replacer(self, inp: str): + return self.re_remove_ending_apostrophe.sub("", inp) \ No 
newline at end of file diff --git a/pie_extended/models/lasla/tokenizer.py b/pie_extended/models/lasla/tokenizer.py index 7d1f6b6..68b55ae 100644 --- a/pie_extended/models/lasla/tokenizer.py +++ b/pie_extended/models/lasla/tokenizer.py @@ -17,14 +17,7 @@ class LatMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") - re_add_space_around_apostrophe_that_are_quotes = re.compile( - r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" - # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter - # ?'. or manger'_ or _'Bonjour - ) - re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") - re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") _sentence_boundaries = re.compile( r"([" + _Dots_except_apostrophe + r"]+\s*)+" ) @@ -63,29 +56,15 @@ def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[s yield from sentences def normalizer(self, data: str) -> str: - data = self.re_remove_ending_apostrophe.sub( - r"\g<1> ", - self.re_add_space_around_apostrophe_that_are_quotes.sub( - r" \g<2> ", - self.re_add_space_around_punct.sub( + data = self.re_add_space_around_punct.sub( r" \g<2> ", self.roman_number_dot.sub( r"_DOT_\g<1>_DOT_", data ) ) - ) - ) return data def replacer(self, inp: str): inp = inp.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i") return inp - - #def normalizer(self, data: str): - # # Fix regarding the current issue of apostrophe - # # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 - # # On the other hand, it creates empty tokens... - # data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) - # data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - # return data \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index b86183e..0ab69ae 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -30,4 +30,4 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: def reinsert(self, form: str) -> Dict[str, str]: self.memory.tokens.pop(0) - return super(MemoryzingProcessor, self).reinsert(form) \ No newline at end of file + return super(MemoryzingProcessor, self).reinsert(form) diff --git a/pie_extended/pipeline/tokenizers/memorizing.py b/pie_extended/pipeline/tokenizers/memorizing.py index 13cd39e..7940e80 100644 --- a/pie_extended/pipeline/tokenizers/memorizing.py +++ b/pie_extended/pipeline/tokenizers/memorizing.py @@ -14,7 +14,7 @@ def replacer(self, token: str) -> str: return token def __init__(self): - self.tokens: List[Tuple[int, int, str]] = [] + self.tokens: List[Tuple[int, str, str]] = [] def _real_word_tokenizer(self, data: str, lower: bool = False) -> List[str]: return super(MemorizingTokenizer, self).word_tokenizer(data, lower=lower) diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 90d9ab6..604e401 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -46,8 +46,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor # to be reinserted sents, lengths, needs_reinsertion = zip(*chunk) - is_empty = [0 == len(sent) for sent in enumerate(sents)] - + is_empty = [not bool(sent) for sent in sents] tagged, tasks = self.tag( sents=[sent for sent in sents if sent], lengths=lengths @@ -72,10 +71,12 
@@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor reinsertion_index = 0 for index, (token, tags) in enumerate(sent): + # Before current index while reinsertion_index + index in sent_reinsertion: yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 + yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index f9a1d09..156bde5 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -43,12 +43,11 @@ def test_consecutive_dots(self): processor=processor, iterator=data_iterator ) - self.assertIn( - "uiduarum uiduarum fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake" - " uiduarum\r\n" - ". . PUNC MORPH=empty .\r\n" - ". . PUNC MORPH=empty .", - result, + self.assertEqual( + result[12], + {"form": "uiduarum", "lemma": "uiduarum", "POS": "fake", "morph": "Case=fake|Numb=fake|Deg=fake|Mood=fake|" + "Tense=fake|Voice=fake|Person=fake", + "treated": "uiduarum"}, "Punctuation should be reinserted and mostly should not break anything" ) @@ -58,23 +57,19 @@ def test_leading_punctuation(self): Special case of consecutive dots, where sentences starts with it """ tagger, data_iterator, processor = make_controller([ + # Need an empty sentence because ( was treated as such "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( - "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", + "( id enim ait) turbabuntur a facie eius patris or phanorum et iudicis uiduarum ..", processor=processor, iterator=data_iterator ) - self.assertIn( - "form lemma POS morph treated_token\r\n" - "( ( PUNC MORPH=empty (\r\n" - "id id fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake id\r\n" - "enim enim fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake enim\r\n" - "ait ait fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake ait\r\n" - ") ) PUNC MORPH=empty )\r\n" - "turbabuntur turbabuntur fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person" - "=fake turbabuntur\r\n", - result, + tokens = [t["form"] for t in result] + self.assertEqual( + ["(", "id", "enim", "ait", ")", "turbabuntur", "a", "facie", "eius", "patris", "or", "phanorum", + "et", "iudicis", "uiduarum", ".", "."], + tokens, "Leading punctuation should not break anything" ) @@ -130,4 +125,4 @@ def test_underscores(self): 'perierat'], flatten_seen, "Seen element should not count the underscord" - ) \ No newline at end of file + ) From ce97f7474612667473a4393b18f32febbfeb85a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 14:38:59 +0100 Subject: [PATCH 08/13] Fixed tests for Lasla --- tests/test_models/test_lasla.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 156bde5..c86cd27 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -96,18 +96,17 @@ def test_j_are_temporarly_replaced(self): """ tagger, data_iterator, processor = make_controller([ - "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" + "iudicis uiduarum" ]) result = tagger.tag_str( - "( id enim ait ) turbabuntur a facie eius 
patris or phanorum et judicis uiduarum . .", + "judicis uiduarum", processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) - self.assertNotIn("judicis", flatten_seen, "'j' should be removed from tagging") - self.assertIn("iudicis", flatten_seen, "And 'i' should replace it") - self.assertIn("\njudicis\t", result, "But, in the end, the original form is given to the user") + self.assertEqual(result[0]["form"], "judicis", "'j' should be removed from tagging") + self.assertEqual(result[0]["treated"], "iudicis", "And 'i' should replace it") def test_underscores(self): string = "una operatio in ecclesiae fundamento.._... _ . laetatur autem pater quia filius perierat" From 0943043b20d252bbb60b54bb0ebcf9634724fbf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:03:01 +0100 Subject: [PATCH 09/13] Remaining bugs but overall not bad --- pie_extended/models/fro/processor.py | 2 +- pie_extended/models/fro/tokenizer.py | 11 ++++++--- tests/test_models/test_fro.py | 37 ++++++++++++++++++++++++++++ tests/test_models/test_lasla.py | 2 +- 4 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 tests/test_models/test_fro.py diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py index d17e51f..21e64ba 100644 --- a/pie_extended/models/fro/processor.py +++ b/pie_extended/models/fro/processor.py @@ -40,4 +40,4 @@ class FroGlueProcessor(GlueProcessor): "PERS.": "_"} def __init__(self, *args, **kwargs): - super(FroGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file + super(FroGlueProcessor, self).__init__(*args, **kwargs) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index bd7c80f..3993587 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -16,8 +16,13 @@ class FroMemorizingTokenizer(MemorizingTokenizer): re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( - r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" - # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + r"(" + r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|" + r"((?<=[\w])[\'’ʼ]+(?=[\W]))|" + r"((?<=[\W])[\'’ʼ]+(?=[\w])))|" + r"(^[\'’ʼ]+)|" + r"([\'’ʼ]+$))" + # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + Starting or ending apostrophe # ?'. 
or manger'_ or _'Bonjour ) re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") @@ -75,4 +80,4 @@ def normalizer(self, data: str) -> str: return data def replacer(self, inp: str): - return self.re_remove_ending_apostrophe.sub("", inp) \ No newline at end of file + return self.re_remove_ending_apostrophe.sub("", inp) diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py new file mode 100644 index 0000000..accce3d --- /dev/null +++ b/tests/test_models/test_fro.py @@ -0,0 +1,37 @@ +from pie_extended.models.fro.get import get_iterator_and_processor +from pie_extended.testing_utils import FakeTagger +from typing import List, Tuple + +from unittest import TestCase +from .test_lasla import make_fake_data + + +def make_controller(sentences: List[str]): + # Add the lemmatizer routes + tagger = FakeTagger( + make_fake_data(sentences), + tasks="lemma,MODE,TEMPS,PERS,NOMB,GENRE,CAS,DEGRE,POS".split(",") + ) + iterator, processor = get_iterator_and_processor() + return tagger, iterator, processor + + +class TestFro(TestCase): + def test_elision_apostrophe(self): + string = "q'il meurt" + treated = ["q il meurt"] + tagger, it, pro = make_controller(treated) + out = tagger.tag_str(string, it, pro) + self.assertEqual(out[0]["form"], "q'") + self.assertEqual(out[0]["treated"], "q") + + def test_elision_apostrophe_and_quote(self): + string = "a q'il meurt 'dit il'" + treated = ["a q il meurt dit il"] + tagger, it, pro = make_controller(treated) + out = tagger.tag_str(string, it, pro) + self.assertEqual(out[0]["form"], "a") + self.assertEqual(out[0]["treated"], "a") + self.assertEqual(out[1]["form"], "q'") + self.assertEqual(out[1]["treated"], "q") + # Ending and starting apostrophe are not reinserted for some reason. diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index c86cd27..1359afc 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -25,7 +25,7 @@ def make_controller(sentences: List[str]): return tagger, iterator, processor -class TestPonctuation(TestCase): +class TestLasla(TestCase): def test_consecutive_dots(self): """Check that consecutive punctation does not break anything From 1166fcf74847675e88c20361e4903c4ee7233c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:52:29 +0100 Subject: [PATCH 10/13] Fro is tested --- pie_extended/models/fro/tokenizer.py | 31 +++++++++++++++------------- tests/test_models/test_fro.py | 9 ++++---- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py index 3993587..bac16b6 100644 --- a/pie_extended/models/fro/tokenizer.py +++ b/pie_extended/models/fro/tokenizer.py @@ -14,7 +14,9 @@ class FroMemorizingTokenizer(MemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ])(\s*)") + APOSTROPHES = "'’ʼ" + re_elision_apostrophe = re.compile(r"(\w+)([" + APOSTROPHES + r"])(\w+)") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") re_add_space_around_apostrophe_that_are_quotes = re.compile( r"(" r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|" @@ -42,10 +44,11 @@ def _sentence_tokenizer_merge_matches(match): start, end = match.span() return match.string[start:end] + "" - @classmethod - def _real_sentence_tokenizer(cls, string: str) -> List[str]: - string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) + def _real_sentence_tokenizer(self, string: str) -> List[str]: + string = 
self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string) string = string.replace("_DOT_", ".") + for index_apo, apo in enumerate(self.APOSTROPHES): + string = string.replace("ApOsTrOpHe"+str(index_apo), apo+" ") return string.split("") def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: @@ -63,19 +66,19 @@ def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[s sentences.append(self.word_tokenizer(sent)) yield from sentences + def apostrophe_replace(self, regex_match) -> str: + return regex_match.group(1) + "ApOsTrOpHe"+ str(self.APOSTROPHES.index(regex_match.group(2))) + regex_match.group(3) + def normalizer(self, data: str) -> str: - data = self.re_remove_ending_apostrophe.sub( - r"\g<1> ", - self.re_add_space_around_apostrophe_that_are_quotes.sub( - r" \g<2> ", - self.re_add_space_around_punct.sub( + data = self.re_add_space_around_punct.sub( r" \g<2> ", - self.roman_number_dot.sub( - r"_DOT_\g<1>_DOT_", - data + self.re_elision_apostrophe.sub( + self.apostrophe_replace, + self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) ) - ) - ) ) return data diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py index accce3d..9f80f6d 100644 --- a/tests/test_models/test_fro.py +++ b/tests/test_models/test_fro.py @@ -26,12 +26,13 @@ def test_elision_apostrophe(self): self.assertEqual(out[0]["treated"], "q") def test_elision_apostrophe_and_quote(self): - string = "a q'il meurt 'dit il'" - treated = ["a q il meurt dit il"] + string = "'q'il meurt 'dit il'" + treated = ["q il meurt dit il"] tagger, it, pro = make_controller(treated) out = tagger.tag_str(string, it, pro) - self.assertEqual(out[0]["form"], "a") - self.assertEqual(out[0]["treated"], "a") + self.assertEqual(out[0]["form"], "'") + self.assertEqual(out[0]["treated"], "'") self.assertEqual(out[1]["form"], "q'") self.assertEqual(out[1]["treated"], "q") + self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept") # Ending and starting apostrophe are not reinserted for some reason. From 01c9dd4940aa5c0bfd6a68200224324654a8d19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 16:57:49 +0100 Subject: [PATCH 11/13] Updated error for pie-extended install lasla & added a test for roman number in Fro --- pie_extended/models/lasla/tokenizer.py | 2 +- tests/test_models/test_fro.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pie_extended/models/lasla/tokenizer.py b/pie_extended/models/lasla/tokenizer.py index 68b55ae..14f6d29 100644 --- a/pie_extended/models/lasla/tokenizer.py +++ b/pie_extended/models/lasla/tokenizer.py @@ -12,7 +12,7 @@ except ImportError as E: click.echo(click.style("You need to install cltk and its Latin Data to runs this package", fg="red")) click.echo("pip install cltk") - click.echo("pie-ext install-addons lasla") + click.echo("pie-extended install-addons lasla") sys.exit(0) diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py index 9f80f6d..16d42ea 100644 --- a/tests/test_models/test_fro.py +++ b/tests/test_models/test_fro.py @@ -36,3 +36,14 @@ def test_elision_apostrophe_and_quote(self): self.assertEqual(out[1]["treated"], "q") self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept") # Ending and starting apostrophe are not reinserted for some reason. + + def test_tokenization_roman_number(self): + iterator, _ = get_iterator_and_processor() + self.assertEqual( + list(iterator.tokenizer.sentence_tokenizer("Les .XIII. 
tables du Duc du XII.. C'est fantastique")), + [ + ["Les", ".XIII.", "tables", "du", "Duc", "du", "XII", ".", "."], + ["C", 'est', "fantastique"] + ], + "Dots around roman number are not sentences markers" + ) \ No newline at end of file From 50d81e576ded28e1a112515697e78623c23b3510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Fri, 21 Feb 2020 18:20:21 +0100 Subject: [PATCH 12/13] Added a lot of doctests --- .travis.yml | 2 +- pie_extended/pipeline/postprocessor/glue.py | 19 ++++- pie_extended/pipeline/postprocessor/memory.py | 26 +++++- pie_extended/pipeline/postprocessor/proto.py | 81 +++++++++++++++++-- .../pipeline/postprocessor/rulebased.py | 19 ++++- 5 files changed, 134 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index cc9cae4..7f428a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: # command to run tests script: - pie-extended install-addons lasla - - nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture + - nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest after_success: - coverage combine - coveralls \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py index 6a54ed4..76cbf66 100644 --- a/pie_extended/pipeline/postprocessor/glue.py +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -1,10 +1,27 @@ -from .proto import ProcessorPrototype, RenamedTaskProcessor +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, RenamedTaskProcessor from typing import Generator, Dict, List class GlueProcessor(RenamedTaskProcessor): """ Glues together specific tasks + >>> class SimpleGlue(GlueProcessor): + ... OUTPUT_KEYS = ["form", "lemma", "task3"] + ... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3` + ... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag + ... GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value + >>> x = SimpleGlue() + >>> x.set_tasks(["lemma", "1", "2"]) + >>> # Merges b and c values from task 1 and 2 into a new task + >>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"} + True + >>> # Keeps only one task because 2 is empty + >>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"} + True + >>> # Fills with the default empty tag because both task 1 and 2 were empty + >>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"} + True + """ # Output keys are keys that are given in the end diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py index 0ab69ae..618970e 100644 --- a/pie_extended/pipeline/postprocessor/memory.py +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -1,4 +1,4 @@ -from .proto import ProcessorPrototype, ChainedProcessor +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor from typing import Optional, Dict, List if "typing" == "nottyping": from ..tokenizers.memorizing import MemorizingTokenizer @@ -9,6 +9,30 @@ class MemoryzingProcessor(ChainedProcessor): by reinserting the original data alongside a new task (KEY) where we output the input seen by the Model + It reuses the memory from a class derived from MemorizingTokenizer so that it reintroduced + the original input into the token. 
+ + >>> from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer + >>> tokenizer = MemorizingTokenizer() + >>> # Fake token memory : (Index, Original Input, Input seen by Tagger) + >>> tokenizer.tokens = [(0, "A", "a"), (0, "b", "b"), (0, "q'", "q")] + >>> processor = MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=ProcessorPrototype()) + >>> processor.set_tasks(["lem"]) + >>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen + >>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY) + >>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"} + True + >>> # Some would have the same treated and input + >>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"} + True + >>> # Some differ with more characters + >>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"} + True + + This allows for easier output alignment as well as removing unknown characters to the model. If your lemmatizer + in training has never seen the "@" character, you can remove it at tokenization time and reinsert it with + MemoryzingProcessor + """ KEY: str = "treated" diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py index d8a7e49..81dbcb3 100644 --- a/pie_extended/pipeline/postprocessor/proto.py +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -8,6 +8,18 @@ class ProcessorPrototype: empty_value: str def __init__(self, empty_value: Optional[str] = None): + """ Applies postprocessing. Simplest Processor one could use. + + :param empty_value: Value to use to fill tasks that would not get any data + + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} + True + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} + True + """ self.tasks = [] self.empty_value = empty_value or DEFAULT_EMPTY @@ -22,20 +34,36 @@ def reinsert(self, form: str) -> Dict[str, str]: :param form: Token to reinsert :return: Dictionary representation of the token, as an annotation + + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} + True """ return dict(form=form, **{task: self.empty_value for task in self.tasks}) def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: """ Get the dictionary representation of a token annotation - :param token: - :param tags: - :return: + :param token: Token used as input for pie + :param tags: List of tags generated + :return: Dictionary representation of the token and its annotations + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} + True """ return {"form": token, **{k: val for k, val in zip(self.tasks, tags)}} def reset(self): - """ Functions that should be run in between documents """ + """ Functions that should be run in between documents + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reset() + """ pass @@ -43,6 +71,15 @@ class RenamedTaskProcessor(ProcessorPrototype): MAP: Dict[str, str] = {} def __init__(self, **kwargs): + """ This Processor is used for renaming tasks (Pie for example refuses tasks containing dots) + + >>> class ExampleRemaped(RenamedTaskProcessor): + ... 
MAP = {"task_name_1": "renamed"} + >>> x = ExampleRemaped() + >>> x.set_tasks(["task_name_1", "y"]) + >>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"} + True + """ super(RenamedTaskProcessor, self).__init__(**kwargs) self._map: Dict[str, str] = type(self).MAP @@ -53,7 +90,39 @@ def set_tasks(self, tasks): class ChainedProcessor(ProcessorPrototype): """ Allows for easy chaining ! - ChainedProcessor(ProcessorPrototype) basically should behave like a normal processor + The ChainedProcessor is basically using its headprocessor in the background and checking it's output to some extent + + The prototype of ChainedProcessor using Processor Prototype would have the same results because + chained processor is not doing anything new except enabling chaining + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> y = ChainedProcessor(x) + >>> y.set_tasks(["a", "b"]) + >>> x.reinsert("x") == y.reinsert("x") + True + >>> x.get_dict("y", ["1", "2"]) == y.get_dict("y", ["1", "2"]) + True + + You can subclass it to modify the output of the preceding processor : + + >>> class ExampleChained(ChainedProcessor): + ... def reinsert(self, form: str) -> Dict[str, str]: + ... annotation = self.head_processor.reinsert(form) + ... annotation["col3"] = "x" + ... return annotation + ... + ... def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]: + ... annotation = self.head_processor.get_dict(form, tags) + ... annotation["col3"] = "x" + ... return annotation + ... + >>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY")) + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"} + True + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"} + True """ head_processor: ProcessorPrototype @@ -76,4 +145,4 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: return self.head_processor.get_dict(token, tags) def reset(self): - self.head_processor.reset() \ No newline at end of file + self.head_processor.reset() diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py index d2d5f1b..0977342 100644 --- a/pie_extended/pipeline/postprocessor/rulebased.py +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -1,4 +1,4 @@ -from .proto import ProcessorPrototype, ChainedProcessor +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor from typing import Optional, Dict, List if "typing" == "nottyping": from ..tokenizers.memorizing import MemorizingTokenizer @@ -8,16 +8,27 @@ class RuleBasedProcessor(ChainedProcessor): """ Applies rules found in rules(token_annotation) """ - KEY: str = "treated" def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs): """ Apply rules on output of the taggers :param apply_on_reinsert: Apply rules on reinsert task + :param head_processor: Processor to use before post-processing its results + + >>> class ExampleRule(RuleBasedProcessor): + ... def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + ... if annotation["form"] == "need": + ... annotation["1"] = "REPLACED" + ... 
return annotation + >>> processor = ExampleRule() + >>> processor.set_tasks(["1", "2"]) + >>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"} + True + >>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"} + True """ super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs) - self._key: str = type(self).KEY - self.apply_on_reinsert= apply_on_reinsert + self.apply_on_reinsert = apply_on_reinsert def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: return annotation From efbfc7da82edbdbefdedfddd6a71d989ae212a17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 22 Feb 2020 09:54:56 +0100 Subject: [PATCH 13/13] Added documentation on how to run the python API --- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/README.md b/README.md index 56fb775..e5a3526 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,12 @@ The current system provide an easier access to adding **customized**: - disambiguation, - output formatting +## Install + +To install, simply do `pip install pie-extended`. Then, look at all available models. + +## Run on terminal + But on top of that, it provides a quick and easy way to use others models ! For example, in a shell : ```bash @@ -26,6 +32,53 @@ pie-extended tag laslsa your_file.txt will give you access to all you need ! +## Python API + +You can run the lemmatizer in your own scripts and retrieve token annotations as dictionaries: + +```python +from typing import List +from pie_extended.cli.sub import get_tagger, get_model, download + +# In case you need to download +do_download = False +if do_download: + for dl in download("lasla"): + x = 1 + +# model_path allows you to override the model loaded by another .tar +model_name = "lasla" +tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None) + +sentences: List[str] = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
"] +# Get the main object from the model (: data iterator + postprocesor +from pie_extended.models.lasla import get_iterator_and_processor +for sentence_group in sentences: + iterator, processor = get_iterator_and_processor() + print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor) ) +``` + +will result in + +```python +[{'form': 'lorem', 'lemma': 'lor', 'POS': 'NOMcom', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'lorem'}, + {'form': 'ipsum', 'lemma': 'ipse', 'POS': 'PROdem', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'ipsum'}, + {'form': 'dolor', 'lemma': 'dolor', 'POS': 'NOMcom', 'morph': 'Case=Nom|Numb=Sing', 'treated': 'dolor'}, + {'form': 'sit', 'lemma': 'sum1', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3', + 'treated': 'sit'}, + {'form': 'amet', 'lemma': 'amo', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3', + 'treated': 'amet'}, {'form': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','}, + {'form': 'consectetur', 'lemma': 'consector2', 'POS': 'VER', + 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Dep|Person=3', 'treated': 'consectetur'}, + {'form': 'adipiscing', 'lemma': 'adipiscor', 'POS': 'VER', 'morph': 'Tense=Pres|Voice=Dep', 'treated': 'adipiscing'}, + {'form': 'elit', 'lemma': 'elio', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3', + 'treated': 'elit'}, {'form': '.', 'lemma': '.', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '.'}] +``` + +## Add a model + +ToDo: Documentation + ## Warning This is an extremely early build, subject to change here and there. But it is functional ! \ No newline at end of file