Merge pull request #9 from hipster-philology/postprocessors
Postprocessors
PonteIneptique committed Feb 25, 2020
2 parents e5e1284 + 57e6188 commit 39dc39b
Showing 30 changed files with 1,240 additions and 501 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
 .idea
 /*.txt
 pie_extended/downloads/*
+tests/**/*.txt
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
3 changes: 2 additions & 1 deletion .travis.yml
@@ -12,7 +12,8 @@ install:
 # command to run tests
 script:
   - pie-extended install-addons lasla
-  - nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture
+  - pie-extended download lasla
+  - nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest
 after_success:
   - coverage combine
   - coveralls
53 changes: 53 additions & 0 deletions README.md
@@ -16,6 +16,12 @@
The current system provides easier access to adding **customized**:
- disambiguation,
- output formatting

## Install

To install, simply run `pip install pie-extended`. Then have a look at the available models and download the one you need, for instance as sketched below.
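For instance, assuming the `lasla` model used throughout this README (the `download` and `install-addons` commands are the same ones exercised in the `.travis.yml` change above):

```bash
pip install pie-extended
# install the model's add-ons, then fetch its model files
pie-extended install-addons lasla
pie-extended download lasla
```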

## Run on terminal

On top of that, it provides a quick and easy way to use other models! For example, in a shell:

```bash
pie-extended tag lasla your_file.txt
```

will give you access to all you need!

## Python API

You can run the lemmatizer in your own scripts and retrieve token annotations as dictionaries:

```python
from typing import List
from pie_extended.cli.sub import get_tagger, get_model, download

# In case you need to download the model files first
do_download = False
if do_download:
    for dl in download("lasla"):
        # download() is a generator yielding progress information: just exhaust it
        pass

# model_path allows you to override the model loaded by another .tar
model_name = "lasla"
tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)

sentences: List[str] = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit. "]
# Get the main objects from the model: data iterator + postprocessor
from pie_extended.models.lasla import get_iterator_and_processor
for sentence_group in sentences:
    iterator, processor = get_iterator_and_processor()
    print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor))
```

This will result in:

```python
[{'form': 'lorem', 'lemma': 'lor', 'POS': 'NOMcom', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'lorem'},
{'form': 'ipsum', 'lemma': 'ipse', 'POS': 'PROdem', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'ipsum'},
{'form': 'dolor', 'lemma': 'dolor', 'POS': 'NOMcom', 'morph': 'Case=Nom|Numb=Sing', 'treated': 'dolor'},
{'form': 'sit', 'lemma': 'sum1', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
'treated': 'sit'},
{'form': 'amet', 'lemma': 'amo', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
'treated': 'amet'}, {'form': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','},
{'form': 'consectetur', 'lemma': 'consector2', 'POS': 'VER',
'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Dep|Person=3', 'treated': 'consectetur'},
{'form': 'adipiscing', 'lemma': 'adipiscor', 'POS': 'VER', 'morph': 'Tense=Pres|Voice=Dep', 'treated': 'adipiscing'},
{'form': 'elit', 'lemma': 'elio', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3',
'treated': 'elit'}, {'form': '.', 'lemma': '.', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '.'}]
```

## Add a model

ToDo: Documentation

## Warning

This is an extremely early build, subject to change here and there. But it is functional!
21 changes: 18 additions & 3 deletions pie_extended/cli/__init__.py
@@ -1,6 +1,7 @@
 import click

 from . import sub
+from typing import Iterable


 MODELS = [name for name, *_ in sub.get_list()]
@@ -54,15 +55,29 @@ def download(model):
               help="Raise error when a file is not tagged correctly")
 @click.option("--model_path", type=str, default=None,
               help="Provide this with your own model path if you want to test it")
-def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path):
+@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False,
+              help="Reset exclude patterns")
+@click.option("--add-pattern", "add_pattern",
+              help="Add new exclude patterns for token (Regular expression)", multiple=True)
+def tag(model: str, filepath: str, allowed_failure: bool, batch_size: int, device: str, debug: bool,
+        model_path: str,
+        reset_patterns: bool, add_pattern: Iterable[str]):
     """ Tag as many [filepath] as you want with [model] """
     from tqdm import tqdm
     click.echo(click.style("Getting the tagger", bold=True))
-    tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path)
+    try:
+        tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path)
+    except FileNotFoundError as e:
+        click.echo("Model not found: please make sure you have downloaded the model files with "
+                   "pie-extended download " + model)
+        if debug:
+            raise e
+        return
     failures = []
     for file in tqdm(filepath):
         try:
-            sub.tag_file(model, tagger, file)
+            sub.tag_file(model, tagger, file, reset_exclude_patterns=reset_patterns,
+                         exclude_patterns=add_pattern)
         except Exception as E:
             failures.append(E)
             click.echo("{} could not be lemmatized".format(file))
52 changes: 46 additions & 6 deletions pie_extended/cli/sub.py
@@ -1,5 +1,5 @@
 import os
-from typing import Tuple, Iterable, Generator, Union
+from typing import Tuple, Iterable, List, Union
 from importlib import import_module

 import requests
@@ -11,11 +11,20 @@
 from pie.utils import model_spec


-def get_model(model):
+def get_model(model: str):
+    """ Retrieve a module given a string
+
+    :param model: Module Name
+    :return: Module
+    """
     return import_module("{}.{}".format(models.__name__, model))


-def download(module) -> Iterable[Union[str, int]]:
+def download(module: str) -> Iterable[Union[str, int]]:
+    """ Download dependencies for the given module
+
+    :param module: Module for which to download models and static files in general
+    """
     lemmatizer = get_model(module)
     os.makedirs(os.path.join(PATH, module), exist_ok=True)
     yield len(lemmatizer.DOWNLOADS)
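The first value yielded is the number of files to fetch; judging from the `Union[str, int]` hint, the remaining yields appear to identify the downloaded files, so a caller could drive a simple progress display along these lines (a sketch under that assumption):

```python
from pie_extended.cli.sub import download

gen = download("lasla")
total = next(gen)  # first yield: how many files will be downloaded
for done, item in enumerate(gen, start=1):
    # assumed: each remaining yield identifies the file that was just fetched
    print(f"[{done}/{total}] {item}")
```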
@@ -30,13 +39,23 @@ def download(module) -> Iterable[Union[str, int]]:


 def get_list() -> Iterable[Tuple[str, Metadata]]:
+    """ Retrieve a list of available modules
+    """
     for module in models.modules:
         desc = getattr(get_model(module), "DESC", None)
         if desc:
             yield module, desc


 def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) -> ExtensibleTagger:
+    """ Retrieve the tagger
+
+    :param model: Module of the tagger
+    :param batch_size: Size of the batch
+    :param device: Device to use (cuda/cpu)
+    :param model_path: Path to the model if you want to override the package one
+    :return: Tagger
+    """
     module = get_model(model)
     disambiguator = getattr(module, "Disambiguator", None)
     if isinstance(disambiguator, ObjectCreator):
@@ -48,10 +67,31 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None)
     return tagger


-def tag_file(model: str, tagger: ExtensibleTagger, fpath):
+def tag_file(
+        model: str, tagger: ExtensibleTagger,
+        fpath: str,
+        reset_exclude_patterns: bool = False,
+        exclude_patterns: List[str] = None):
+    """ Tag a file with a given model
+
+    :param model: Module name of the model
+    :param tagger: Tagger that should be used
+    :param fpath: Path to the file to edit
+    :param reset_exclude_patterns: Remove all pre-registered token exclusion regular expressions
+    :param exclude_patterns: New exclude patterns to add to the data iterator (Does not require reset)
+    """
     module = get_model(model)
-    iterator, formatter = getattr(module, "get_iterator_and_formatter")()
-    tagger.tag_file(fpath, iterator=iterator, formatter_class=formatter)
+    iterator, processor = getattr(module, "get_iterator_and_processor")()
+    # Remove first pattern
+    if reset_exclude_patterns:
+        iterator.reset_patterns()
+
+    # Add new
+    if exclude_patterns:
+        for pattern in exclude_patterns:
+            iterator.add_pattern(pattern)
+
+    tagger.tag_file(fpath, iterator=iterator, processor=processor)
     return True
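A minimal sketch of calling the new signature from Python; the model name, file path and pattern are placeholders:

```python
from pie_extended.cli.sub import get_tagger, tag_file

# "lasla" and the file path stand in for whichever model and text you use
tagger = get_tagger("lasla", batch_size=16, device="cpu")
# keep the model's default exclusions and add one more token-exclusion regex
tag_file("lasla", tagger, "your_file.txt", exclude_patterns=[r"\d+"])
```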


4 changes: 2 additions & 2 deletions pie_extended/models/fro/__init__.py
@@ -1,5 +1,5 @@
-from ...utils import Metadata, File ,get_path
-from .classes import get_iterator_and_formatter
+from ...utils import Metadata, File, get_path
+from .get import get_iterator_and_processor
 from ...pipeline.iterators.proto import DataIterator

 DESC = Metadata(
152 changes: 0 additions & 152 deletions pie_extended/models/fro/classes.py

This file was deleted.
