Skip to content

Commit

Permalink
Added a lot of doctests
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 21, 2020
1 parent 01c9dd4 commit 50d81e5
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ install:
# command to run tests
script:
- pie-extended install-addons lasla
- nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture
- nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest
after_success:
- coverage combine
- coveralls
19 changes: 18 additions & 1 deletion pie_extended/pipeline/postprocessor/glue.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
from .proto import ProcessorPrototype, RenamedTaskProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, RenamedTaskProcessor
from typing import Generator, Dict, List


class GlueProcessor(RenamedTaskProcessor):
""" Glues together specific tasks
>>> class SimpleGlue(GlueProcessor):
... OUTPUT_KEYS = ["form", "lemma", "task3"]
... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3`
... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag
... GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value
>>> x = SimpleGlue()
>>> x.set_tasks(["lemma", "1", "2"])
>>> # Merges b and c values from task 1 and 2 into a new task
>>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"}
True
>>> # Keeps only one task because 2 is empty
>>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"}
True
>>> # Fills with the default empty tag because both task 1 and 2 were empty
>>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"}
True
"""

# Output keys are keys that are given in the end
Expand Down
26 changes: 25 additions & 1 deletion pie_extended/pipeline/postprocessor/memory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .proto import ProcessorPrototype, ChainedProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor
from typing import Optional, Dict, List
if "typing" == "nottyping":
from ..tokenizers.memorizing import MemorizingTokenizer
Expand All @@ -9,6 +9,30 @@ class MemoryzingProcessor(ChainedProcessor):
by reinserting the original data alongside a new task (KEY) where we output
the input seen by the Model
It reuses the memory from a class derived from MemorizingTokenizer so that it reintroduces
the original input into the token.
>>> from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
>>> tokenizer = MemorizingTokenizer()
>>> # Fake token memory : (Index, Original Input, Input seen by Tagger)
>>> tokenizer.tokens = [(0, "A", "a"), (0, "b", "b"), (0, "q'", "q")]
>>> processor = MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=ProcessorPrototype())
>>> processor.set_tasks(["lem"])
>>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen
>>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY)
>>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"}
True
>>> # Some would have the same treated and input
>>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"}
True
>>> # Some differ with more characters
>>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"}
True
This allows for easier output alignment as well as removing characters that are unknown to the model. If your lemmatizer
in training has never seen the "@" character, you can remove it at tokenization time and reinsert it with
MemoryzingProcessor
"""
KEY: str = "treated"

Expand Down
81 changes: 75 additions & 6 deletions pie_extended/pipeline/postprocessor/proto.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@ class ProcessorPrototype:
empty_value: str

def __init__(self, empty_value: Optional[str] = None):
    """ Applies postprocessing. Simplest Processor one could use.

    :param empty_value: Value used to fill tasks that received no data;
        any falsy value (None, "") falls back to the module default DEFAULT_EMPTY.

    >>> x = ProcessorPrototype(empty_value="%")
    >>> x.set_tasks(["a", "b"])
    >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"}
    True
    >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
    True
    """
    # No tasks are known until set_tasks() is called.
    self.tasks = []
    self.empty_value = DEFAULT_EMPTY if not empty_value else empty_value

Expand All @@ -22,27 +34,52 @@ def reinsert(self, form: str) -> Dict[str, str]:
:param form: Token to reinsert
:return: Dictionary representation of the token, as an annotation
>>> x = ProcessorPrototype(empty_value="%")
>>> x.set_tasks(["a", "b"])
>>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"}
True
"""
return dict(form=form, **{task: self.empty_value for task in self.tasks})

def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
    """ Build the dictionary representation of a single token annotation.

    :param token: Token used as input for pie
    :param tags: List of tags generated by the tagger
    :return: Dictionary mapping "form" to the token and each task name to its tag

    >>> x = ProcessorPrototype(empty_value="%")
    >>> x.set_tasks(["a", "b"])
    >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
    True
    """
    annotation = {"form": token}
    # Pair each configured task with the tag produced for it, in order;
    # zip() stops at the shorter of the two sequences.
    annotation.update(zip(self.tasks, tags))
    return annotation

def reset(self):
    """ Hook run in between documents to clear any per-document state.

    The base implementation keeps no such state, so this is a no-op;
    chained or stateful subclasses override it.

    >>> x = ProcessorPrototype(empty_value="%")
    >>> x.set_tasks(["a", "b"])
    >>> x.reset()
    """


class RenamedTaskProcessor(ProcessorPrototype):
MAP: Dict[str, str] = {}

def __init__(self, **kwargs):
    """ This Processor is used for renaming tasks (Pie for example refuses tasks containing dots)

    :param kwargs: Keyword arguments forwarded to ProcessorPrototype (e.g. empty_value)

    >>> class ExampleRemaped(RenamedTaskProcessor):
    ...     MAP = {"task_name_1": "renamed"}
    >>> x = ExampleRemaped()
    >>> x.set_tasks(["task_name_1", "y"])
    >>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"}
    True
    """
    super(RenamedTaskProcessor, self).__init__(**kwargs)
    # Read the mapping from the concrete class so each subclass can override MAP.
    self._map: Dict[str, str] = type(self).MAP

Expand All @@ -53,7 +90,39 @@ def set_tasks(self, tasks):
class ChainedProcessor(ProcessorPrototype):
""" Allows for easy chaining !
ChainedProcessor(ProcessorPrototype) basically should behave like a normal processor
The ChainedProcessor basically uses its head processor in the background and checks its output to some extent
The prototype of ChainedProcessor using Processor Prototype would have the same results because
chained processor is not doing anything new except enabling chaining
>>> x = ProcessorPrototype(empty_value="%")
>>> x.set_tasks(["a", "b"])
>>> y = ChainedProcessor(x)
>>> y.set_tasks(["a", "b"])
>>> x.reinsert("x") == y.reinsert("x")
True
>>> x.get_dict("y", ["1", "2"]) == y.get_dict("y", ["1", "2"])
True
You can subclass it to modify the output of the preceding processor:
>>> class ExampleChained(ChainedProcessor):
... def reinsert(self, form: str) -> Dict[str, str]:
... annotation = self.head_processor.reinsert(form)
... annotation["col3"] = "x"
... return annotation
...
... def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]:
... annotation = self.head_processor.get_dict(form, tags)
... annotation["col3"] = "x"
... return annotation
...
>>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY"))
>>> x.set_tasks(["a", "b"])
>>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"}
True
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"}
True
"""
head_processor: ProcessorPrototype
Expand All @@ -76,4 +145,4 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
return self.head_processor.get_dict(token, tags)

def reset(self):
self.head_processor.reset()
self.head_processor.reset()
19 changes: 15 additions & 4 deletions pie_extended/pipeline/postprocessor/rulebased.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .proto import ProcessorPrototype, ChainedProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor
from typing import Optional, Dict, List
if "typing" == "nottyping":
from ..tokenizers.memorizing import MemorizingTokenizer
Expand All @@ -8,16 +8,27 @@ class RuleBasedProcessor(ChainedProcessor):
""" Applies rules found in rules(token_annotation)
"""
KEY: str = "treated"

def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs):
    """ Apply rules on output of the taggers

    :param apply_on_reinsert: Apply rules on reinsert task
    :param head_processor: Processor to use before post-processing its results

    >>> class ExampleRule(RuleBasedProcessor):
    ...     def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
    ...         if annotation["form"] == "need":
    ...             annotation["1"] = "REPLACED"
    ...         return annotation
    >>> processor = ExampleRule()
    >>> processor.set_tasks(["1", "2"])
    >>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"}
    True
    >>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"}
    True
    """
    super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs)
    # Cache the class-level KEY so subclasses overriding KEY are honoured.
    self._key: str = type(self).KEY
    # Single PEP8-spaced assignment (source showed a duplicated, mis-spaced line).
    self.apply_on_reinsert = apply_on_reinsert

def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
    """ Hook applied to each token annotation; identity by default.

    Subclasses override this to modify tags based on the annotation content.

    :param annotation: Dictionary representation of a token ("form" plus one key per task)
    :return: The (possibly modified) annotation dictionary
    """
    return annotation
Expand Down

0 comments on commit 50d81e5

Please sign in to comment.