Added more tests to check for configuration with a new FakeTagger + working exclude patterns
PonteIneptique committed Feb 25, 2020
1 parent f278d4a commit 2484287
Showing 13 changed files with 249 additions and 36 deletions.
1 change: 0 additions & 1 deletion pie_extended/cli/__init__.py
@@ -63,7 +63,6 @@ def tag(model: str, filepath: str, allowed_failure: bool, batch_size: int, devic
model_path: str,
reset_patterns: bool, add_pattern: Iterable[str]):
""" Tag as many [filepath] as you want with [model] """
- print(reset_patterns, add_pattern)
from tqdm import tqdm
click.echo(click.style("Getting the tagger", bold=True))
try:
5 changes: 3 additions & 2 deletions pie_extended/models/fro/get.py
@@ -1,4 +1,5 @@
- from .processor import FroRulesProcessor, FroGlueProcessor, FroMapProcessor
+ from .processor import FroRulesProcessor, FroGlueProcessor
+ from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor
from .tokenizer import FroMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
@@ -11,7 +12,7 @@ def get_iterator_and_processor():
head_processor=MemoryzingProcessor(
tokenizer_memory=tokenizer,
head_processor=FroGlueProcessor(
- head_processor=FroMapProcessor()
+ head_processor=RenamedTaskProcessor({"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."})
)
)
)
9 changes: 1 addition & 8 deletions pie_extended/models/fro/processor.py
@@ -40,11 +40,4 @@ class FroGlueProcessor(GlueProcessor):
"PERS.": "_"}

def __init__(self, *args, **kwargs):
- super(FroGlueProcessor, self).__init__(*args, **kwargs)
-
-
- class FroMapProcessor(RenamedTaskProcessor):
- MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}
-
- def __init__(self, *args, **kwargs):
- super(FroMapProcessor, self).__init__(*args, **kwargs)
+ super(FroGlueProcessor, self).__init__(*args, **kwargs)
5 changes: 3 additions & 2 deletions pie_extended/models/lasla/get.py
@@ -1,6 +1,7 @@
import regex as re

- from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor, LatinMapProcessor
+ from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor
+ from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor, ProcessorPrototype
from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
@@ -16,7 +17,7 @@ def get_iterator_and_processor():
head_processor=MemoryzingProcessor(
tokenizer_memory=tokenizer,
head_processor=LatinGlueProcessor(
- LatinMapProcessor()
+ ProcessorPrototype()
)
)
)
10 changes: 1 addition & 9 deletions pie_extended/models/lasla/processor.py
@@ -3,7 +3,6 @@

from pie_extended.pipeline.postprocessor.glue import GlueProcessor
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor
- from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor


class LatinRulesProcessor(RuleBasedProcessor):
@@ -31,16 +30,9 @@ def __init__(self, *args, **kwargs):


class LatinGlueProcessor(GlueProcessor):
OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
OUTPUT_KEYS = ["form", "lemma", "pos", "morph"]
GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]}
WHEN_EMPTY = {"morph": "MORPH=empty"}

def __init__(self, *args, **kwargs):
super(LatinGlueProcessor, self).__init__(*args, **kwargs)
-
-
- class LatinMapProcessor(RenamedTaskProcessor):
- MAP = {"pos": "POS"}
-
- def __init__(self, *args, **kwargs):
- super(LatinMapProcessor, self).__init__(*args, **kwargs)
16 changes: 15 additions & 1 deletion pie_extended/pipeline/postprocessor/glue.py
@@ -1,4 +1,4 @@
- from pie_extended.pipeline.postprocessor.proto import ChainedProcessor, ProcessorPrototype
+ from pie_extended.pipeline.postprocessor.proto import ChainedProcessor, ProcessorPrototype, RenamedTaskProcessor
from typing import Generator, Dict, List


@@ -23,6 +23,20 @@ class GlueProcessor(ChainedProcessor):
>>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"}
True
+ You can also use remapped tasks:
+ >>> class AnotherGlue(GlueProcessor):
+ ...     OUTPUT_KEYS = ["form", "lemma", "POS", "task3"]
+ ...     GLUE = {"task3": ["1", "2"]}  # Merges task `1` output and task `2` output into `task3`
+ ...     EMPTY_TAG = {"1": "_", "2": "_"}  # A `_` tag in task `1` or `2` counts as an empty tag
+ ...     GLUE_EMPTY = {"task3": "NO-DATA"}  # Default value when all merged data are empty
+ >>> x = AnotherGlue(head_processor=RenamedTaskProcessor({"pos": "POS"}))
+ >>> x.set_tasks(["lemma", "pos", "1", "2"])  # You can see the tasks are remapped
+ ['lemma', 'POS', 'task3']
+ >>> # Merges b and c values from task 1 and 2 into a new task
+ >>> x.get_dict("a", ["a", "p", "b", "c"])
+ {'form': 'a', 'lemma': 'a', 'POS': 'p', 'task3': '1=b|2=c'}
"""

# Output keys are keys that are given in the end
5 changes: 3 additions & 2 deletions pie_extended/pipeline/postprocessor/memory.py
@@ -37,10 +37,11 @@ class MemoryzingProcessor(ChainedProcessor):
"""
KEY: str = "treated"

- def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Optional[ProcessorPrototype], **kwargs):
+ def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: ProcessorPrototype,
+              key: Optional[str] = None, **kwargs):
super(MemoryzingProcessor, self).__init__(head_processor=head_processor, **kwargs)
self.memory: "MemorizingTokenizer" = tokenizer_memory
- self._key: str = type(self).KEY
+ self._key: str = key or type(self).KEY

def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
# First we get the dictionary
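The new `key` argument makes the output column name configurable: when it is not given, the processor falls back to the class-level KEY ("treated"). A minimal sketch of the new signature in use; the tokenizer here is a stand-in and the custom key name is hypothetical:

from pie_extended.models.fro.tokenizer import FroMemorizingTokenizer
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype

tokenizer = FroMemorizingTokenizer()  # assumed constructible without arguments
processor = MemoryzingProcessor(
    tokenizer_memory=tokenizer,
    head_processor=ProcessorPrototype(),
    key="treated_token"  # hypothetical custom name; defaults to MemoryzingProcessor.KEY ("treated")
)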
6 changes: 2 additions & 4 deletions pie_extended/pipeline/postprocessor/proto.py
@@ -76,9 +76,7 @@ def reset(self):


class RenamedTaskProcessor(ProcessorPrototype):
- MAP: Dict[str, str] = {}
-
- def __init__(self, **kwargs):
+ def __init__(self, task_map: Dict[str, str], **kwargs):
""" This Processor is used for renaming tasks (Pie for example refuses tasks containing dots)
>>> class ExampleRemaped(RenamedTaskProcessor):
@@ -90,7 +88,7 @@ def __init__(self, **kwargs):
True
"""
super(RenamedTaskProcessor, self).__init__(**kwargs)
- self._map: Dict[str, str] = type(self).MAP
+ self._map: Dict[str, str] = task_map

def set_tasks(self, tasks):
self._tasks = [self._map.get(task, task) for task in tasks]
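RenamedTaskProcessor thus takes its renaming map as a constructor argument instead of a class-level MAP, which is what lets the fro and glue changes above configure renaming inline. A minimal sketch, using the same map as the fro model:

from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor

renamer = RenamedTaskProcessor(task_map={"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."})
renamer.set_tasks(["lemma", "pos", "NOMB", "PERS"])
# renamer.tasks should now read ['lemma', 'POS', 'NOMB.', 'PERS.']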
15 changes: 10 additions & 5 deletions pie_extended/tagger.py
@@ -26,10 +26,13 @@ def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorProto

_, ext = os.path.splitext(fpath)

- with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f:
+ out_file = utils.ensure_ext(fpath, ext, 'pie')
+ with open(out_file, 'w+') as f:
for line in self.iter_tag(data, iterator, processor=processor):
f.write(line)

+ return out_file

def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) -> str:
return list(self.iter_tag_token(data, iterator, processor=processor))

@@ -42,15 +45,19 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor
for chunk in utils.chunks(
iterator(data, lower=self.lower),
size=self.batch_size):

+ # Unzip the batch into the sentences, their sizes, and the dictionaries of things that need
+ # to be reinserted

sents, lengths, needs_reinsertion = zip(*chunk)

+ is_empty = [not bool(sent) for sent in sents]

tagged, tasks = self.tag(
sents=[sent for sent in sents if sent],
- lengths=lengths
+ lengths=[l for l in lengths if l != 0]
)

if not processor.tasks:
processor.set_tasks(tasks)

@@ -65,7 +72,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor
sent_reinsertion = needs_reinsertion[sents_index]

# If we have a disambiguator, we run the results into it
- if self.disambiguation:
+ if self.disambiguation and sent:
sent = self.disambiguation(sent, tasks)

reinsertion_index = 0
@@ -86,11 +93,9 @@ def iter_tag(self, data: str, iterator: DataIterator, processor: ProcessorProtot
formatter = None

for annotation in self.iter_tag_token(data, iterator, processor):
- print(processor, processor.tasks)
if not formatter:
formatter = Formatter(processor.tasks)
yield formatter.write_headers()
- print(annotation)
yield formatter.write_line(formatter.format_line(annotation))

if formatter:
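Besides skipping empty sentences, tag_file now returns the path it wrote, so callers (and the new tests) no longer have to reconstruct it from the input name. A short usage sketch, assuming tagger, iterator and processor are already built:

out_file = tagger.tag_file("corpus.txt", iterator=iterator, processor=processor)
# utils.ensure_ext derives the output name from the input, e.g. "corpus-pie.txt"
with open(out_file) as f:
    headers = f.readline()  # first line carries the column headers written by the Formatter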
71 changes: 71 additions & 0 deletions pie_extended/testing_utils/__init__.py
@@ -1,4 +1,8 @@
+ from typing import List, Tuple
+ from pie_extended.pipeline.iterators.proto import DataIterator
+ from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
from pie_extended.tagger import ExtensibleTagger
+ from pie.utils import model_spec


class FakeTagger(ExtensibleTagger):
@@ -14,3 +18,70 @@ def tag(self, sents, **kwargs):
self.seen.extend(sents)

return self.tokens, self.tasks


class FakeAutoTag(ExtensibleTagger):
def __init__(self, tasks: List[str], **kwargs):
self.tokens: List[str] = []
self.lengths: List[int] = []
self.tasks = tasks
for key in kwargs:
setattr(self, key, kwargs[key])

def tag(self, sents: List[List[str]], lengths: List[int], *args, **kwargs):
""" Fake tagging tokens by enumerating informations
>>> tagger = FakeAutoTag(["pos", "lemma"])
>>> tagger.tag([['a', 'b'], ['c']], lengths=[2, 1])
([[('a', ('pos0', 'lemma0')), ('b', ('pos1', 'lemma1'))], [('c', ('pos2', 'lemma2'))]], ['pos', 'lemma'])
"""
self.tokens.extend(list(sents))
self.lengths.extend(lengths)

for t, l in zip(sents, lengths):
if len(t) != l:
raise ValueError("Tokens and lengths are inequal [len({}) != {}]".format(str(t), l))

out = []
total = 0

def get_task(task, i):
return task+str(i)

for sent in sents:
out.append([])
for tok in sent:
out[-1].append((tok, tuple(list(get_task(task, total) for task in self.tasks))))
total += 1
return out, self.tasks

@staticmethod
def from_model_string(model_string: str, **kwargs) -> "FakeAutoTag":
"""
:param model_string:
:return:
>>> tagger = FakeAutoTag.from_model_string("<path/to/tar,MODE,TEMPS,PERS,NOMB><path/to/tar,lemma,pos>")
>>> tagger.tasks
['MODE', 'TEMPS', 'PERS', 'NOMB', 'lemma', 'pos']
"""
return FakeAutoTag(tasks=[
task
for _, tasks in model_spec(model_string)
for task in tasks
], **kwargs)


def create_auto_tagger(module, **kwargs) -> Tuple[FakeAutoTag, DataIterator, ProcessorPrototype]:
""" Create a tagger as well as the iterator """
tagger = FakeAutoTag.from_model_string(module.Models, batch_size=16, **kwargs)

disambiguator = getattr(module, "Disambiguator", None)
if hasattr(disambiguator, "create"):
disambiguator = disambiguator.create()
tagger.disambiguation = disambiguator

iterator, processor = module.get_iterator_and_processor()
return tagger, iterator, processor
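Together with the fixture files below, this is enough to exercise a model pipeline end to end without loading real weights. A hypothetical test sketch, assuming the model package (here fro) exposes Models, an optional Disambiguator and get_iterator_and_processor:

from pie_extended.testing_utils import create_auto_tagger
from pie_extended.models import fro

tagger, iterator, processor = create_auto_tagger(fro)
out_file = tagger.tag_file("tests/test_models/crazy_text_file.txt", iterator, processor)
with open(out_file) as f:  # written next to the input as crazy_text_file-pie.txt
    assert f.readline().startswith("token")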
55 changes: 55 additions & 0 deletions tests/test_models/crazy_text_file-pie.txt
@@ -0,0 +1,55 @@
token lemma pos morph
\ \ PUNC MORPH=empty
\ \ PUNC MORPH=empty
< < PUNC MORPH=empty
1 lemma0 pos0 Case=Case0|Numb=Numb0|Deg=Deg0|Mood=Mood0|Tense=Tense0|Voice=Voice0|Person=Person0
> > PUNC MORPH=empty
[ [ PUNC MORPH=empty
$ $ PUNC MORPH=empty
@ @ PUNC MORPH=empty
$ $ PUNC MORPH=empty
] ] PUNC MORPH=empty
( ( PUNC MORPH=empty
v lemma1 pos1 Case=Case1|Numb=Numb1|Deg=Deg1|Mood=Mood1|Tense=Tense1|Voice=Voice1|Person=Person1
) ) PUNC MORPH=empty
\ \ PUNC MORPH=empty
\ \ PUNC MORPH=empty
§ § PUNC MORPH=empty
\ \ PUNC MORPH=empty
< < PUNC MORPH=empty
1 lemma2 pos2 Case=Case2|Numb=Numb2|Deg=Deg2|Mood=Mood2|Tense=Tense2|Voice=Voice2|Person=Person2
> > PUNC MORPH=empty
[ [ PUNC MORPH=empty
$ $ PUNC MORPH=empty
@ @ PUNC MORPH=empty
$ $ PUNC MORPH=empty
] ] PUNC MORPH=empty
\ \ PUNC MORPH=empty
§ § PUNC MORPH=empty
§ § PUNC MORPH=empty
\ \ PUNC MORPH=empty
[ [ PUNC MORPH=empty
i lemma3 pos3 Case=Case3|Numb=Numb3|Deg=Deg3|Mood=Mood3|Tense=Tense3|Voice=Voice3|Person=Person3
] ] PUNC MORPH=empty
\ \ PUNC MORPH=empty
§ § PUNC MORPH=empty
en lemma4 pos4 Case=Case4|Numb=Numb4|Deg=Deg4|Mood=Mood4|Tense=Tense4|Voice=Voice4|Person=Person4
honor lemma5 pos5 Case=Case5|Numb=Numb5|Deg=Deg5|Mood=Mood5|Tense=Tense5|Voice=Voice5|Person=Person5
et lemma6 pos6 Case=Case6|Numb=Numb6|Deg=Deg6|Mood=Mood6|Tense=Tense6|Voice=Voice6|Person=Person6
en lemma7 pos7 Case=Case7|Numb=Numb7|Deg=Deg7|Mood=Mood7|Tense=Tense7|Voice=Voice7|Person=Person7
bie lemma8 pos8 Case=Case8|Numb=Numb8|Deg=Deg8|Mood=Mood8|Tense=Tense8|Voice=Voice8|Person=Person8
-ne ne2 pos9 Case=Case9|Numb=Numb9|Deg=Deg9|Mood=Mood9|Tense=Tense9|Voice=Voice9|Person=Person9
et lemma10 pos10 Case=Case10|Numb=Numb10|Deg=Deg10|Mood=Mood10|Tense=Tense10|Voice=Voice10|Person=Person10
en lemma11 pos11 Case=Case11|Numb=Numb11|Deg=Deg11|Mood=Mood11|Tense=Tense11|Voice=Voice11|Person=Person11
gra lemma12 pos12 Case=Case12|Numb=Numb12|Deg=Deg12|Mood=Mood12|Tense=Tense12|Voice=Voice12|Person=Person12
-ne ne2 pos13 Case=Case13|Numb=Numb13|Deg=Deg13|Mood=Mood13|Tense=Tense13|Voice=Voice13|Person=Person13
remembrançe lemma14 pos14 Case=Case14|Numb=Numb14|Deg=Deg14|Mood=Mood14|Tense=Tense14|Voice=Voice14|Person=Person14
§ § PUNC MORPH=empty
et lemma15 pos15 Case=Case15|Numb=Numb15|Deg=Deg15|Mood=Mood15|Tense=Tense15|Voice=Voice15|Person=Person15
offerant lemma16 pos16 Case=Case16|Numb=Numb16|Deg=Deg16|Mood=Mood16|Tense=Tense16|Voice=Voice16|Person=Person16
mercé lemma17 pos17 Case=Case17|Numb=Numb17|Deg=Deg17|Mood=Mood17|Tense=Tense17|Voice=Voice17|Person=Person17
, , PUNC MORPH=empty
honor lemma18 pos18 Case=Case18|Numb=Numb18|Deg=Deg18|Mood=Mood18|Tense=Tense18|Voice=Voice18|Person=Person18
et lemma19 pos19 Case=Case19|Numb=Numb19|Deg=Deg19|Mood=Mood19|Tense=Tense19|Voice=Voice19|Person=Person19
celebrançe lemma20 pos20 Case=Case20|Numb=Numb20|Deg=Deg20|Mood=Mood20|Tense=Tense20|Voice=Voice20|Person=Person20
§ § PUNC MORPH=empty
6 changes: 6 additions & 0 deletions tests/test_models/crazy_text_file.txt
@@ -0,0 +1,6 @@
\\<1>[$@$](V)\\§
\<1>[$@$]\§
§
\[I]\§
En honor et en bien et en gran remembrançe §
Et offerant mercé, honor et celebrançe §
