Try at making token exclusion easier to configure
PonteIneptique committed Feb 25, 2020
1 parent efbfc7d commit f278d4a
Showing 14 changed files with 194 additions and 89 deletions.
13 changes: 11 additions & 2 deletions pie_extended/cli/__init__.py
@@ -1,6 +1,7 @@
import click

from . import sub
from typing import Iterable


MODELS = [name for name, *_ in sub.get_list()]
@@ -54,8 +55,15 @@ def download(model):
help="Raise error when a file is not tagged correctly")
@click.option("--model_path", type=str, default=None,
help="Provide this with your own model path if you want to test it")
def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path):
@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False,
help="Reset exclude patterns")
@click.option("--add-pattern", "add_pattern",
help="Add new exclude patterns for token (Regular expression)", multiple=True)
def tag(model: str, filepath: str, allowed_failure: bool, batch_size: int, device: str, debug: bool,
model_path: str,
reset_patterns: bool, add_pattern: Iterable[str]):
""" Tag as many [filepath] as you want with [model] """
print(reset_patterns, add_pattern)
from tqdm import tqdm
click.echo(click.style("Getting the tagger", bold=True))
try:
@@ -69,7 +77,8 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path)
failures = []
for file in tqdm(filepath):
try:
sub.tag_file(model, tagger, file)
sub.tag_file(model, tagger, file, reset_exclude_patterns=reset_patterns,
exclude_patterns=add_pattern)
except Exception as E:
failures.append(E)
click.echo("{} could not be lemmatized".format(file))
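
The two new options lean on click collecting every repeated --add-pattern flag into a tuple of strings, which is why add_pattern is typed as Iterable[str] and can be forwarded to sub.tag_file unchanged. A minimal, self-contained sketch of that behaviour follows; the demo command is hypothetical and not part of this commit.

import click
from click.testing import CliRunner


@click.command()
@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False)
@click.option("--add-pattern", "add_pattern", multiple=True)
def demo(reset_patterns, add_pattern):
    # multiple=True makes add_pattern a tuple holding every value passed on the command line
    click.echo("{} {}".format(reset_patterns, list(add_pattern)))


if __name__ == "__main__":
    runner = CliRunner()
    result = runner.invoke(demo, ["--add-pattern", "_Passage_.*", "--add-pattern", "[0-9]+"])
    print(result.output)  # False ['_Passage_.*', '[0-9]+']
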
48 changes: 44 additions & 4 deletions pie_extended/cli/sub.py
@@ -1,5 +1,5 @@
import os
from typing import Tuple, Iterable, Generator, Union
from typing import Tuple, Iterable, List, Union
from importlib import import_module

import requests
@@ -11,11 +11,20 @@
from pie.utils import model_spec


def get_model(model):
def get_model(model: str):
""" Retrieve a module given a string
:param model: Module Name
:return: Module
"""
return import_module("{}.{}".format(models.__name__, model))


def download(module) -> Iterable[Union[str, int]]:
def download(module: str) -> Iterable[Union[str, int]]:
""" Download dependencies for the given module
:param module: Module for which to download models and static files in general
"""
lemmatizer = get_model(module)
os.makedirs(os.path.join(PATH, module), exist_ok=True)
yield len(lemmatizer.DOWNLOADS)
@@ -30,13 +39,23 @@ def download(module) -> Iterable[Union[str, int]]:


def get_list() -> Iterable[Tuple[str, Metadata]]:
""" Retrieve a list of available modules
"""
for module in models.modules:
desc = getattr(get_model(module), "DESC", None)
if desc:
yield module, desc


def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) -> ExtensibleTagger:
""" Retrieve the tagger
:param model: Module of the tagger
:param batch_size: Size of the batch
:param device: Device to use (cuda/cpu)
:param model_path: Path to the model if you want to override the package one
:return: Tagger
"""
module = get_model(model)
disambiguator = getattr(module, "Disambiguator", None)
if isinstance(disambiguator, ObjectCreator):
@@ -48,9 +67,30 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None)
return tagger


def tag_file(model: str, tagger: ExtensibleTagger, fpath):
def tag_file(
model: str, tagger: ExtensibleTagger,
fpath: str,
reset_exclude_patterns: bool = False,
exclude_patterns: List[str] = None):
""" Tag a file with a given model
:param model: Module name of the model
:param tagger: Tagger that should be used
:param fpath: Path to the file to edit
:param reset_exclude_patterns: Remove all pre-registered token exclusion regular expressions
:param exclude_patterns: New exclude patterns to add to the data iterator (Does not require reset)
"""
module = get_model(model)
iterator, processor = getattr(module, "get_iterator_and_processor")()
# Remove pre-registered patterns if requested
if reset_exclude_patterns:
iterator.reset_patterns()

# Add new exclusion patterns
if exclude_patterns:
for pattern in exclude_patterns:
iterator.add_pattern(pattern)

tagger.tag_file(fpath, iterator=iterator, processor=processor)
return True

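
The same toggles are available outside the CLI through these helpers. A usage sketch, assuming the chosen model has already been downloaded and with my_text.txt as a placeholder path:

from pie_extended.cli import sub

tagger = sub.get_tagger("lasla", batch_size=16, device="cpu")

# Drop the pre-registered exclusion patterns and register a single custom one
sub.tag_file(
    "lasla", tagger, "my_text.txt",
    reset_exclude_patterns=True,
    exclude_patterns=[r"_Passage_[\w\d_]+"],
)
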
10 changes: 6 additions & 4 deletions pie_extended/models/fro/get.py
@@ -1,6 +1,6 @@
from .processor import FroRulesProcessor, FroGlueProcessor
from .processor import FroRulesProcessor, FroGlueProcessor, FroMapProcessor
from .tokenizer import FroMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor


@@ -10,12 +10,14 @@ def get_iterator_and_processor():
apply_on_reinsert=True,
head_processor=MemoryzingProcessor(
tokenizer_memory=tokenizer,
head_processor=FroGlueProcessor()
head_processor=FroGlueProcessor(
head_processor=FroMapProcessor()
)
)
)
iterator = DataIterator(
tokenizer=tokenizer,
remove_from_input=DataIterator.remove_punctuation
exclude_patterns=[GenericExcludePatterns.Punctuation_and_Underscore]
)
return iterator, processor
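
With GenericExcludePatterns.Punctuation_and_Underscore registered at construction time, the iterator behaves like the former remove_punctuation helper and additionally drops underscore-only tokens. A small sketch, assuming pie_extended and its fro module are importable; the sample tokens are illustrative only.

from pie_extended.models.fro.get import get_iterator_and_processor

iterator, processor = get_iterator_and_processor()
print(iterator.exclude_tokens(["Rollant", "est", "proz", "_", "..."]))
# (['Rollant', 'est', 'proz'], {3: '_', 4: '...'})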

15 changes: 11 additions & 4 deletions pie_extended/models/fro/processor.py
@@ -1,16 +1,17 @@
import regex as re
from typing import Dict
from typing import Dict, Pattern

from pie_extended.pipeline.postprocessor.glue import GlueProcessor
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor
from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor


class FroRulesProcessor(RuleBasedProcessor):
""" Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically
"""
PONCTU = re.compile(r"^\W+$")
NUMBER = re.compile(r"\d+")
PONCTU: Pattern = re.compile(r"^\W+$")
NUMBER: Pattern = re.compile(r"\d+")
PONFORT = [".", "...", "!", "?"]

def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
@@ -35,9 +36,15 @@ class FroGlueProcessor(GlueProcessor):
"""
OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]}
MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}
EMPTY_TAG: Dict[str, str] = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_", "TEMPS": "_", "GENRE": "_",
"PERS.": "_"}

def __init__(self, *args, **kwargs):
super(FroGlueProcessor, self).__init__(*args, **kwargs)


class FroMapProcessor(RenamedTaskProcessor):
MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}

def __init__(self, *args, **kwargs):
super(FroMapProcessor, self).__init__(*args, **kwargs)
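
The new FroMapProcessor only renames task keys via its MAP before FroGlueProcessor glues the morphology together. RenamedTaskProcessor's internals are not shown in this commit, so the following is a conceptual sketch of the renaming step only, not the library's implementation:

MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}

def rename_tasks(annotation: dict) -> dict:
    # Rename the task keys listed in MAP, leave everything else untouched
    return {MAP.get(key, key): value for key, value in annotation.items()}

print(rename_tasks({"lemma": "estre", "pos": "VERcjg", "NOMB": "s"}))
# {'lemma': 'estre', 'POS': 'VERcjg', 'NOMB.': 's'}
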
10 changes: 6 additions & 4 deletions pie_extended/models/lasla/get.py
@@ -1,8 +1,8 @@
import regex as re

from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor
from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor, LatinMapProcessor
from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor

# Uppercase regexp
@@ -15,11 +15,13 @@ def get_iterator_and_processor():
apply_on_reinsert=True,
head_processor=MemoryzingProcessor(
tokenizer_memory=tokenizer,
head_processor=LatinGlueProcessor()
head_processor=LatinGlueProcessor(
LatinMapProcessor()
)
)
)
iterator = DataIterator(
tokenizer=tokenizer,
remove_from_input=DataIterator.remove_punctuation
exclude_patterns=[GenericExcludePatterns.Punctuation_and_Underscore]
)
return iterator, processor
9 changes: 8 additions & 1 deletion pie_extended/models/lasla/processor.py
@@ -3,6 +3,7 @@

from pie_extended.pipeline.postprocessor.glue import GlueProcessor
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor
from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor


class LatinRulesProcessor(RuleBasedProcessor):
@@ -33,7 +34,13 @@ class LatinGlueProcessor(GlueProcessor):
OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]}
WHEN_EMPTY = {"morph": "MORPH=empty"}

def __init__(self, *args, **kwargs):
super(LatinGlueProcessor, self).__init__(*args, **kwargs)


class LatinMapProcessor(RenamedTaskProcessor):
MAP = {"pos": "POS"}

def __init__(self, *args, **kwargs):
super(LatinGlueProcessor, self).__init__(*args, **kwargs)
super(LatinMapProcessor, self).__init__(*args, **kwargs)
94 changes: 70 additions & 24 deletions pie_extended/pipeline/iterators/proto.py
@@ -1,50 +1,97 @@
import regex as re

from pie.tagger import simple_tokenizer
from typing import Callable, List, Tuple, Dict, Union, Iterable
from typing import List, Tuple, Dict, Iterable, Pattern, Union

from ...utils import ObjectCreator
from ..tokenizers.simple_tokenizer import SimpleTokenizer
from pie_extended.pipeline.tokenizers.simple_tokenizer import SimpleTokenizer
from enum import Enum

Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]]
PUNKT = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1)

class GenericExcludePatterns(Enum):
""" Useful set of regular expresion that can be used for the exclude_patterns
"""
Punctuation_and_Underscore: Pattern = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1)
Punctuation: Pattern = re.compile(r"^[^\s\w]+$")
PassageMarker: Pattern = re.compile(r"_Passage_[\w\d_]+") # Use `_` as a joining character


class DataIterator:
def __init__(self, tokenizer: SimpleTokenizer = None, remove_from_input: Callable = None):
def __init__(self, tokenizer: SimpleTokenizer = None, exclude_patterns: List[Union[str, Pattern]] = None):
""" Iterator used to parse the text and returns bits to tag
:param tokenizer: Tokenizer
"""
self.tokenizer: SimpleTokenizer = tokenizer or SimpleTokenizer()
self.remove_from_input = remove_from_input
if self.remove_from_input is None:
self.remove_from_input = lambda x: (x, {})
self.exclude_patterns: List[Pattern] = []
if exclude_patterns:
for pattern in exclude_patterns:
self.add_pattern(pattern)

def add_pattern(self, pattern: str):
""" Add a pattern for removal
:param pattern: Pattern for token removal
"""
if isinstance(pattern, str):
self.exclude_patterns.append(re.compile(pattern))
elif hasattr(pattern, "value"): # Deal with enum
self.exclude_patterns.append(pattern.value)
else:
self.exclude_patterns.append(pattern)

def reset_patterns(self) -> None:
""" Removes removal patterns
>>> x = DataIterator(exclude_patterns=[r'\W+'])
>>> x.exclude_tokens(["Je", "suis", "content", ",", "mais", "...", '"', "fatigué", '"', "."])
(['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'})
>>> x.reset_patterns()
>>> x.exclude_tokens(["Je", "suis", "content", ",", "mais", "...", '"', "fatigué", '"', "."])
(['Je', 'suis', 'content', ',', 'mais', '...', '"', 'fatigué', '"', '.'], {})
"""
self.exclude_patterns = []

@staticmethod
def remove_punctuation(sentence: List[str]) -> Tuple[List[str], Dict[int, str]]:
def exclude_tokens(self, sentence: List[str]) -> Tuple[List[str], Dict[int, str]]:
""" Removes punctuation from a list and keeps its index
:param sentence:
:return: First the sentence with things removed, then a dictionary whose keys are index of token to reinsert and
associated values are punctuation to reinsert.
>>> x = DataIterator.remove_punctuation(["Je", "suis", "content",",", "mais", "...", '"', "fatigué", '"', "."])
>>> assert x == (['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'})
You can use strings when generating the exclude_patterns
>>> x = DataIterator(exclude_patterns=[r'\W+'])
>>> x.exclude_tokens(["Je", "suis", "content",",", "mais", "...", '"', "fatigué", '"', "."])
(['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'})
Pre-built removers:
>>> x = DataIterator(exclude_patterns=[GenericExcludePatterns.PassageMarker])
>>> x.exclude_tokens(["_Passage_45_78", "Ici", "commence", "le", "passage"])
(['Ici', 'commence', 'le', 'passage'], {0: '_Passage_45_78'})
And of course you can ignore this option
>>> x = DataIterator()
>>> x.exclude_tokens(["_Passage_45_78", "Ici", "commence", "le", "passage"])
(['_Passage_45_78', 'Ici', 'commence', 'le', 'passage'], {})
"""
if len(self.exclude_patterns) == 0:
return sentence, {}

clean, removed = [], {}
for index, token in enumerate(sentence):
if PUNKT.match(token):
removed[index] = token
else:
match = False
for exclude_pattern in self.exclude_patterns:
if exclude_pattern.match(token):
removed[index] = token
match = True
break
if not match:
clean.append(token)
return clean, removed

def get_remover(self) -> Remover:
if isinstance(self.remove_from_input, ObjectCreator):
return self.remove_from_input.create()
return self.remove_from_input

def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], int, Dict[int, str]]]:
""" Default iter data takes a text, an option to make lower
and yield lists of words along with the length of the list
@@ -53,7 +100,6 @@ def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str],
:param lower: Whether or not to lower the text
:yields: (Sentence as a list of word, Size of the sentence, Elements removed from the sentence)
"""
remover = self.get_remover()
for sentence in self.tokenizer.sentence_tokenizer(data, lower=lower):
clean_sentence, removed_from_input = remover(sentence)
clean_sentence, removed_from_input = self.exclude_tokens(sentence)
yield clean_sentence, len(clean_sentence), removed_from_input
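
add_pattern accepts a raw string (compiled on the fly), a GenericExcludePatterns member (its .value is used), or an already compiled pattern. A sketch of the round trip, assuming pie_extended is installed; the Roman-numeral and digit patterns are only illustrative additions:

import regex as re

from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns

iterator = DataIterator(exclude_patterns=[GenericExcludePatterns.Punctuation])
iterator.add_pattern(r"^[IVXLCDM]+$")       # string: compiled on the fly
iterator.add_pattern(re.compile(r"^\d+$"))  # compiled pattern: stored as-is

tokens = ["Liber", "I", ",", "chapitre", "2", ":", "incipit"]
clean, removed = iterator.exclude_tokens(tokens)
print(clean)    # ['Liber', 'chapitre', 'incipit']
print(removed)  # {1: 'I', 2: ',', 4: '2', 5: ':'}

# Excluded tokens can be reinserted at their original indices after tagging
restored = list(clean)
for index in sorted(removed):
    restored.insert(index, removed[index])
print(restored == tokens)  # True
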
22 changes: 0 additions & 22 deletions pie_extended/pipeline/postprocessor/disambiguator.py

This file was deleted.
