Skip to content

Commit

Permalink
Added a lot of doctests
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 21, 2020
1 parent 01c9dd4 commit 50d81e5
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ install:
# command to run tests
script:
- pie-extended install-addons lasla
- nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture
- nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest
after_success:
- coverage combine
- coveralls
19 changes: 18 additions & 1 deletion pie_extended/pipeline/postprocessor/glue.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
from .proto import ProcessorPrototype, RenamedTaskProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, RenamedTaskProcessor
from typing import Generator, Dict, List


class GlueProcessor(RenamedTaskProcessor):
""" Glues together specific tasks
>>> class SimpleGlue(GlueProcessor):
... OUTPUT_KEYS = ["form", "lemma", "task3"]
... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3`
... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag
... GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value
>>> x = SimpleGlue()
>>> x.set_tasks(["lemma", "1", "2"])
>>> # Merges b and c values from task 1 and 2 into a new task
>>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"}
True
>>> # Keeps only one task because 2 is empty
>>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"}
True
>>> # Fills with the default empty tag because both task 1 and 2 were empty
>>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"}
True
"""

# Output keys are keys that are given in the end
Expand Down
26 changes: 25 additions & 1 deletion pie_extended/pipeline/postprocessor/memory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .proto import ProcessorPrototype, ChainedProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor
from typing import Optional, Dict, List
if "typing" == "nottyping":
from ..tokenizers.memorizing import MemorizingTokenizer
Expand All @@ -9,6 +9,30 @@ class MemoryzingProcessor(ChainedProcessor):
by reinserting the original data alongside a new task (KEY) where we output
the input seen by the Model
It reuses the memory from a class derived from MemorizingTokenizer so that it reintroduces
the original input into the token.
>>> from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
>>> tokenizer = MemorizingTokenizer()
>>> # Fake token memory : (Index, Original Input, Input seen by Tagger)
>>> tokenizer.tokens = [(0, "A", "a"), (0, "b", "b"), (0, "q'", "q")]
>>> processor = MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=ProcessorPrototype())
>>> processor.set_tasks(["lem"])
>>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen
>>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY)
>>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"}
True
>>> # Some would have the same treated and input
>>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"}
True
>>> # Some differ with more characters
>>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"}
True
This allows for easier output alignment as well as removing characters that are unknown to the model. If your lemmatizer
in training has never seen the "@" character, you can remove it at tokenization time and reinsert it with
MemoryzingProcessor
"""
KEY: str = "treated"

Expand Down
81 changes: 75 additions & 6 deletions pie_extended/pipeline/postprocessor/proto.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@ class ProcessorPrototype:
empty_value: str

def __init__(self, empty_value: Optional[str] = None):
    """ Applies postprocessing. Simplest Processor one could use.

    :param empty_value: Value used to fill tasks that received no data;
        any falsy value (None, "") falls back to the module default DEFAULT_EMPTY.

    >>> x = ProcessorPrototype(empty_value="%")
    >>> x.set_tasks(["a", "b"])
    >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"}
    True
    >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
    True
    """
    # No tasks are known until set_tasks() is called.
    self.tasks = []
    self.empty_value = DEFAULT_EMPTY if not empty_value else empty_value

Expand All @@ -22,27 +34,52 @@ def reinsert(self, form: str) -> Dict[str, str]:
:param form: Token to reinsert
:return: Dictionary representation of the token, as an annotation
>>> x = ProcessorPrototype(empty_value="%")
>>> x.set_tasks(["a", "b"])
>>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"}
True
"""
return dict(form=form, **{task: self.empty_value for task in self.tasks})

def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
    """ Build the dictionary representation of a single token annotation.

    :param token: Token used as input for pie
    :param tags: List of tags generated by the tagger
    :return: Dictionary mapping "form" to the token and each task name to its tag

    >>> x = ProcessorPrototype(empty_value="%")
    >>> x.set_tasks(["a", "b"])
    >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
    True
    """
    annotation = {"form": token}
    # Pair each configured task with the tag produced for it, in order;
    # zip() stops at the shorter of the two sequences.
    annotation.update(zip(self.tasks, tags))
    return annotation

def reset(self):
    """ Hook run in between documents to clear any per-document state.

    The base implementation keeps no such state, so this is a no-op;
    chained or stateful subclasses override it.

    >>> x = ProcessorPrototype(empty_value="%")
    >>> x.set_tasks(["a", "b"])
    >>> x.reset()
    """


class RenamedTaskProcessor(ProcessorPrototype):
MAP: Dict[str, str] = {}

def __init__(self, **kwargs):
    """ This Processor is used for renaming tasks (Pie for example refuses tasks containing dots)

    :param kwargs: Keyword arguments forwarded to ProcessorPrototype (e.g. empty_value)

    >>> class ExampleRemaped(RenamedTaskProcessor):
    ...     MAP = {"task_name_1": "renamed"}
    >>> x = ExampleRemaped()
    >>> x.set_tasks(["task_name_1", "y"])
    >>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"}
    True
    """
    super(RenamedTaskProcessor, self).__init__(**kwargs)
    # Read the mapping from the concrete class so each subclass can override MAP.
    self._map: Dict[str, str] = type(self).MAP

Expand All @@ -53,7 +90,39 @@ def set_tasks(self, tasks):
class ChainedProcessor(ProcessorPrototype):
""" Allows for easy chaining !
ChainedProcessor(ProcessorPrototype) basically should behave like a normal processor
The ChainedProcessor basically uses its head processor in the background and checks its output to some extent
The prototype of ChainedProcessor using Processor Prototype would have the same results because
chained processor is not doing anything new except enabling chaining
>>> x = ProcessorPrototype(empty_value="%")
>>> x.set_tasks(["a", "b"])
>>> y = ChainedProcessor(x)
>>> y.set_tasks(["a", "b"])
>>> x.reinsert("x") == y.reinsert("x")
True
>>> x.get_dict("y", ["1", "2"]) == y.get_dict("y", ["1", "2"])
True
You can subclass it to modify the output of the preceding processor:
>>> class ExampleChained(ChainedProcessor):
... def reinsert(self, form: str) -> Dict[str, str]:
... annotation = self.head_processor.reinsert(form)
... annotation["col3"] = "x"
... return annotation
...
... def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]:
... annotation = self.head_processor.get_dict(form, tags)
... annotation["col3"] = "x"
... return annotation
...
>>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY"))
>>> x.set_tasks(["a", "b"])
>>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"}
True
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"}
True
"""
head_processor: ProcessorPrototype
Expand All @@ -76,4 +145,4 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
return self.head_processor.get_dict(token, tags)

def reset(self):
self.head_processor.reset()
self.head_processor.reset()
19 changes: 15 additions & 4 deletions pie_extended/pipeline/postprocessor/rulebased.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .proto import ProcessorPrototype, ChainedProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor
from typing import Optional, Dict, List
if "typing" == "nottyping":
from ..tokenizers.memorizing import MemorizingTokenizer
Expand All @@ -8,16 +8,27 @@ class RuleBasedProcessor(ChainedProcessor):
""" Applies rules found in rules(token_annotation)
"""
KEY: str = "treated"

def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs):
    """ Apply rules on output of the taggers

    :param apply_on_reinsert: Apply rules on reinsert task
    :param head_processor: Processor to use before post-processing its results

    >>> class ExampleRule(RuleBasedProcessor):
    ...     def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
    ...         if annotation["form"] == "need":
    ...             annotation["1"] = "REPLACED"
    ...         return annotation
    >>> processor = ExampleRule()
    >>> processor.set_tasks(["1", "2"])
    >>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"}
    True
    >>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"}
    True
    """
    super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs)
    # Cache the class-level KEY so subclasses overriding KEY are honoured.
    self._key: str = type(self).KEY
    # Single PEP8-spaced assignment (source showed a duplicated, mis-spaced line).
    self.apply_on_reinsert = apply_on_reinsert

def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
    """ Hook applied to each token annotation; identity by default.

    Subclasses override this to modify tags based on the annotation content.

    :param annotation: Dictionary representation of a token ("form" plus one key per task)
    :return: The (possibly modified) annotation dictionary
    """
    return annotation
Expand Down

0 comments on commit 50d81e5

Please sign in to comment.